diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S
index 053f8769539..e75f2c9b942 100644
--- a/wolfcrypt/src/aes_gcm_asm.S
+++ b/wolfcrypt/src/aes_gcm_asm.S
@@ -59,7 +59,7 @@
 .p2align	4
 #endif /* __APPLE__ */
 L_GCM_generate_m0_aesni_rev8:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
+.quad	0x08090a0b0c0d0e0f,0x0001020304050607
 #ifndef __APPLE__
 .data
 #else
@@ -71,7 +71,7 @@ L_GCM_generate_m0_aesni_rev8:
 .p2align	4
 #endif /* __APPLE__ */
 L_GCM_generate_m0_aesni_mod2_128:
-.quad	0x0, 0xe100000000000000
+.quad	0x0000000000000000,0xe100000000000000
 #ifndef __APPLE__
 .text
 .globl	GCM_generate_m0_aesni
@@ -325,7 +325,7 @@ _GCM_generate_m0_aesni:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_one:
-.quad	0x0, 0x1
+.quad	0x0000000000000000,0x0000000000000001
 #ifndef __APPLE__
 .data
 #else
@@ -337,7 +337,7 @@ L_aes_gcm_one:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_two:
-.quad	0x0, 0x2
+.quad	0x0000000000000000,0x0000000000000002
 #ifndef __APPLE__
 .data
 #else
@@ -349,7 +349,7 @@ L_aes_gcm_two:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_three:
-.quad	0x0, 0x3
+.quad	0x0000000000000000,0x0000000000000003
 #ifndef __APPLE__
 .data
 #else
@@ -361,7 +361,7 @@ L_aes_gcm_three:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_four:
-.quad	0x0, 0x4
+.quad	0x0000000000000000,0x0000000000000004
 #ifndef __APPLE__
 .data
 #else
@@ -373,7 +373,7 @@ L_aes_gcm_four:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_five:
-.quad	0x0, 0x5
+.quad	0x0000000000000000,0x0000000000000005
 #ifndef __APPLE__
 .data
 #else
@@ -385,7 +385,7 @@ L_aes_gcm_five:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_six:
-.quad	0x0, 0x6
+.quad	0x0000000000000000,0x0000000000000006
 #ifndef __APPLE__
 .data
 #else
@@ -397,7 +397,7 @@ L_aes_gcm_six:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_seven:
-.quad	0x0, 0x7
+.quad	0x0000000000000000,0x0000000000000007
 #ifndef __APPLE__
 .data
 #else
@@ -409,7 +409,7 @@ L_aes_gcm_seven:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_eight:
-.quad	0x0, 0x8
+.quad	0x0000000000000000,0x0000000000000008
 #ifndef __APPLE__
 .data
 #else
@@ -421,7 +421,7 @@ L_aes_gcm_eight:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_bswap_epi64:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .data
 #else
@@ -433,7 +433,7 @@ L_aes_gcm_bswap_epi64:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_bswap_mask:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
+.quad	0x08090a0b0c0d0e0f,0x0001020304050607
 #ifndef __APPLE__
 .data
 #else
@@ -445,7 +445,7 @@ L_aes_gcm_bswap_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_aes_gcm_mod2_128:
-.quad	0x1, 0xc200000000000000
+.quad	0x0000000000000001,0xc200000000000000
 #ifndef __APPLE__
 .text
 .globl	AES_GCM_encrypt_aesni
@@ -6490,7 +6490,7 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
 .p2align	4
 #endif /* __APPLE__ */
 L_GCM_generate_m0_avx1_rev8:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
+.quad	0x08090a0b0c0d0e0f,0x0001020304050607
 #ifndef __APPLE__
 .data
 #else
@@ -6502,7 +6502,7 @@ L_GCM_generate_m0_avx1_rev8:
 .p2align	4
 #endif /* __APPLE__ */
 L_GCM_generate_m0_avx1_mod2_128:
-.quad	0x0, 0xe100000000000000
+.quad	0x0000000000000000,0xe100000000000000
 #ifndef __APPLE__
 .text
 .globl	GCM_generate_m0_avx1
@@ -6722,7 +6722,7 @@ _GCM_generate_m0_avx1:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_one:
-.quad	0x0, 0x1
+.quad	0x0000000000000000,0x0000000000000001
 #ifndef __APPLE__
 .data
 #else
@@ -6734,7 +6734,7 @@ L_avx1_aes_gcm_one:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_two:
-.quad	0x0, 0x2
+.quad	0x0000000000000000,0x0000000000000002
 #ifndef __APPLE__
 .data
 #else
@@ -6746,7 +6746,7 @@ L_avx1_aes_gcm_two:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_three:
-.quad	0x0, 0x3
+.quad	0x0000000000000000,0x0000000000000003
 #ifndef __APPLE__
 .data
 #else
@@ -6758,7 +6758,7 @@ L_avx1_aes_gcm_three:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_four:
-.quad	0x0, 0x4
+.quad	0x0000000000000000,0x0000000000000004
 #ifndef __APPLE__
 .data
 #else
@@ -6770,7 +6770,7 @@ L_avx1_aes_gcm_four:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_five:
-.quad	0x0, 0x5
+.quad	0x0000000000000000,0x0000000000000005
 #ifndef __APPLE__
 .data
 #else
@@ -6782,7 +6782,7 @@ L_avx1_aes_gcm_five:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_six:
-.quad	0x0, 0x6
+.quad	0x0000000000000000,0x0000000000000006
 #ifndef __APPLE__
 .data
 #else
@@ -6794,7 +6794,7 @@ L_avx1_aes_gcm_six:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_seven:
-.quad	0x0, 0x7
+.quad	0x0000000000000000,0x0000000000000007
 #ifndef __APPLE__
 .data
 #else
@@ -6806,7 +6806,7 @@ L_avx1_aes_gcm_seven:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_eight:
-.quad	0x0, 0x8
+.quad	0x0000000000000000,0x0000000000000008
 #ifndef __APPLE__
 .data
 #else
@@ -6818,7 +6818,7 @@ L_avx1_aes_gcm_eight:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_bswap_epi64:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .data
 #else
@@ -6830,7 +6830,7 @@ L_avx1_aes_gcm_bswap_epi64:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_bswap_mask:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
+.quad	0x08090a0b0c0d0e0f,0x0001020304050607
 #ifndef __APPLE__
 .data
 #else
@@ -6842,7 +6842,7 @@ L_avx1_aes_gcm_bswap_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_aes_gcm_mod2_128:
-.quad	0x1, 0xc200000000000000
+.quad	0x0000000000000001,0xc200000000000000
 #ifndef __APPLE__
 .text
 .globl	AES_GCM_encrypt_avx1
@@ -11953,7 +11953,7 @@ L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
 .p2align	4
 #endif /* __APPLE__ */
 L_GCM_generate_m0_avx2_rev8:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
+.quad	0x08090a0b0c0d0e0f,0x0001020304050607
 #ifndef __APPLE__
 .data
 #else
@@ -11965,7 +11965,7 @@ L_GCM_generate_m0_avx2_rev8:
 .p2align	4
 #endif /* __APPLE__ */
 L_GCM_generate_m0_avx2_mod2_128:
-.quad	0x0, 0xe100000000000000
+.quad	0x0000000000000000,0xe100000000000000
 #ifndef __APPLE__
 .text
 .globl	GCM_generate_m0_avx2
@@ -12185,7 +12185,7 @@ _GCM_generate_m0_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_one:
-.quad	0x0, 0x1
+.quad	0x0000000000000000,0x0000000000000001
 #ifndef __APPLE__
 .data
 #else
@@ -12197,7 +12197,7 @@ L_avx2_aes_gcm_one:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_two:
-.quad	0x0, 0x2
+.quad	0x0000000000000000,0x0000000000000002
 #ifndef __APPLE__
 .data
 #else
@@ -12209,7 +12209,7 @@ L_avx2_aes_gcm_two:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_three:
-.quad	0x0, 0x3
+.quad	0x0000000000000000,0x0000000000000003
 #ifndef __APPLE__
 .data
 #else
@@ -12221,7 +12221,7 @@ L_avx2_aes_gcm_three:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_four:
-.quad	0x0, 0x4
+.quad	0x0000000000000000,0x0000000000000004
 #ifndef __APPLE__
 .data
 #else
@@ -12233,7 +12233,7 @@ L_avx2_aes_gcm_four:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_five:
-.quad	0x0, 0x5
+.quad	0x0000000000000000,0x0000000000000005
 #ifndef __APPLE__
 .data
 #else
@@ -12245,7 +12245,7 @@ L_avx2_aes_gcm_five:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_six:
-.quad	0x0, 0x6
+.quad	0x0000000000000000,0x0000000000000006
 #ifndef __APPLE__
 .data
 #else
@@ -12257,7 +12257,7 @@ L_avx2_aes_gcm_six:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_seven:
-.quad	0x0, 0x7
+.quad	0x0000000000000000,0x0000000000000007
 #ifndef __APPLE__
 .data
 #else
@@ -12269,7 +12269,7 @@ L_avx2_aes_gcm_seven:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_eight:
-.quad	0x0, 0x8
+.quad	0x0000000000000000,0x0000000000000008
 #ifndef __APPLE__
 .data
 #else
@@ -12281,7 +12281,7 @@ L_avx2_aes_gcm_eight:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_bswap_one:
-.quad	0x0, 0x100000000000000
+.quad	0x0000000000000000,0x0100000000000000
 #ifndef __APPLE__
 .data
 #else
@@ -12293,7 +12293,7 @@ L_avx2_aes_gcm_bswap_one:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_bswap_epi64:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .data
 #else
@@ -12305,7 +12305,7 @@ L_avx2_aes_gcm_bswap_epi64:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_bswap_mask:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
+.quad	0x08090a0b0c0d0e0f,0x0001020304050607
 #ifndef __APPLE__
 .data
 #else
@@ -12317,7 +12317,7 @@ L_avx2_aes_gcm_bswap_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx2_aes_gcm_mod2_128:
-.quad	0x1, 0xc200000000000000
+.quad	0x0000000000000001,0xc200000000000000
 #ifndef __APPLE__
 .text
 .globl	AES_GCM_encrypt_avx2
diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm
index 61eb671bb2e..d222bc14478 100644
--- a/wolfcrypt/src/aes_gcm_asm.asm
+++ b/wolfcrypt/src/aes_gcm_asm.asm
@@ -18,6 +18,7 @@
 ;  * along with this program; if not, write to the Free Software
 ;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 ;  */
+
 IF @Version LT 1200
 ; AVX2 instructions not recognized by old versions of MASM
 IFNDEF NO_AVX2_SUPPORT
@@ -42,15 +43,17 @@ ENDIF
 
 _DATA SEGMENT
 ALIGN 16
-L_GCM_generate_m0_aesni_rev8 QWORD 579005069656919567, 283686952306183
+L_GCM_generate_m0_aesni_rev8 QWORD \
+     08090a0b0c0d0e0fh,  0001020304050607h
 ptr_L_GCM_generate_m0_aesni_rev8 QWORD L_GCM_generate_m0_aesni_rev8
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_GCM_generate_m0_aesni_mod2_128 QWORD 0, 16212958658533785600
+L_GCM_generate_m0_aesni_mod2_128 QWORD \
+     0000000000000000h, 0e100000000000000h
 ptr_L_GCM_generate_m0_aesni_mod2_128 QWORD L_GCM_generate_m0_aesni_mod2_128
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 GCM_generate_m0_aesni PROC
         sub	rsp, 80
         movdqu	OWORD PTR [rsp], xmm6
@@ -292,63 +295,74 @@ GCM_generate_m0_aesni PROC
         add	rsp, 80
         ret
 GCM_generate_m0_aesni ENDP
-_text ENDS
+_TEXT ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_one QWORD 0, 1
+L_aes_gcm_one QWORD \
+     0000000000000000h,  0000000000000001h
 ptr_L_aes_gcm_one QWORD L_aes_gcm_one
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_two QWORD 0, 2
+L_aes_gcm_two QWORD \
+     0000000000000000h,  0000000000000002h
 ptr_L_aes_gcm_two QWORD L_aes_gcm_two
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_three QWORD 0, 3
+L_aes_gcm_three QWORD \
+     0000000000000000h,  0000000000000003h
 ptr_L_aes_gcm_three QWORD L_aes_gcm_three
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_four QWORD 0, 4
+L_aes_gcm_four QWORD \
+     0000000000000000h,  0000000000000004h
 ptr_L_aes_gcm_four QWORD L_aes_gcm_four
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_five QWORD 0, 5
+L_aes_gcm_five QWORD \
+     0000000000000000h,  0000000000000005h
 ptr_L_aes_gcm_five QWORD L_aes_gcm_five
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_six QWORD 0, 6
+L_aes_gcm_six QWORD \
+     0000000000000000h,  0000000000000006h
 ptr_L_aes_gcm_six QWORD L_aes_gcm_six
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_seven QWORD 0, 7
+L_aes_gcm_seven QWORD \
+     0000000000000000h,  0000000000000007h
 ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_eight QWORD 0, 8
+L_aes_gcm_eight QWORD \
+     0000000000000000h,  0000000000000008h
 ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
+L_aes_gcm_bswap_epi64 QWORD \
+     0001020304050607h,  08090a0b0c0d0e0fh
 ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
+L_aes_gcm_bswap_mask QWORD \
+     08090a0b0c0d0e0fh,  0001020304050607h
 ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
+L_aes_gcm_mod2_128 QWORD \
+     0000000000000001h, 0c200000000000000h
 ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_aesni PROC
         push	r13
         push	rdi
@@ -2218,8 +2232,8 @@ L_AES_GCM_encrypt_aesni_store_tag_done:
         pop	r13
         ret
 AES_GCM_encrypt_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_aesni PROC
         push	r13
         push	rdi
@@ -3641,8 +3655,8 @@ L_AES_GCM_decrypt_aesni_cmp_tag_done:
         pop	r13
         ret
 AES_GCM_decrypt_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_init_aesni PROC
         push	rdi
         push	rsi
@@ -3999,8 +4013,8 @@ L_AES_GCM_init_aesni_iv_done:
         pop	rdi
         ret
 AES_GCM_init_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_aad_update_aesni PROC
         mov	rax, rcx
         sub	rsp, 32
@@ -4076,8 +4090,8 @@ L_AES_GCM_aad_update_aesni_16_loop:
         add	rsp, 32
         ret
 AES_GCM_aad_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_block_aesni PROC
         mov	r10, r8
         mov	r11, r9
@@ -4116,8 +4130,8 @@ L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
         pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
         ret
 AES_GCM_encrypt_block_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_ghash_block_aesni PROC
         sub	rsp, 32
         movdqu	OWORD PTR [rsp], xmm6
@@ -4187,8 +4201,8 @@ AES_GCM_ghash_block_aesni PROC
         add	rsp, 32
         ret
 AES_GCM_ghash_block_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_update_aesni PROC
         push	r13
         push	r12
@@ -5426,8 +5440,8 @@ L_AES_GCM_encrypt_update_aesni_done_enc:
         pop	r13
         ret
 AES_GCM_encrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_final_aesni PROC
         push	r13
         push	r12
@@ -5538,8 +5552,8 @@ L_AES_GCM_encrypt_final_aesni_store_tag_done:
         pop	r13
         ret
 AES_GCM_encrypt_final_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_update_aesni PROC
         push	r13
         push	r12
@@ -6321,8 +6335,8 @@ L_AES_GCM_decrypt_update_aesni_done_dec:
         pop	r13
         ret
 AES_GCM_decrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_final_aesni PROC
         push	r13
         push	r12
@@ -6454,19 +6468,21 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
         pop	r13
         ret
 AES_GCM_decrypt_final_aesni ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX1
 _DATA SEGMENT
 ALIGN 16
-L_GCM_generate_m0_avx1_rev8 QWORD 579005069656919567, 283686952306183
+L_GCM_generate_m0_avx1_rev8 QWORD \
+     08090a0b0c0d0e0fh,  0001020304050607h
 ptr_L_GCM_generate_m0_avx1_rev8 QWORD L_GCM_generate_m0_avx1_rev8
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_GCM_generate_m0_avx1_mod2_128 QWORD 0, 16212958658533785600
+L_GCM_generate_m0_avx1_mod2_128 QWORD \
+     0000000000000000h, 0e100000000000000h
 ptr_L_GCM_generate_m0_avx1_mod2_128 QWORD L_GCM_generate_m0_avx1_mod2_128
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 GCM_generate_m0_avx1 PROC
         sub	rsp, 80
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -6674,63 +6690,74 @@ GCM_generate_m0_avx1 PROC
         add	rsp, 80
         ret
 GCM_generate_m0_avx1 ENDP
-_text ENDS
+_TEXT ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_one QWORD 0, 1
+L_avx1_aes_gcm_one QWORD \
+     0000000000000000h,  0000000000000001h
 ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_two QWORD 0, 2
+L_avx1_aes_gcm_two QWORD \
+     0000000000000000h,  0000000000000002h
 ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_three QWORD 0, 3
+L_avx1_aes_gcm_three QWORD \
+     0000000000000000h,  0000000000000003h
 ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_four QWORD 0, 4
+L_avx1_aes_gcm_four QWORD \
+     0000000000000000h,  0000000000000004h
 ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_five QWORD 0, 5
+L_avx1_aes_gcm_five QWORD \
+     0000000000000000h,  0000000000000005h
 ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_six QWORD 0, 6
+L_avx1_aes_gcm_six QWORD \
+     0000000000000000h,  0000000000000006h
 ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_seven QWORD 0, 7
+L_avx1_aes_gcm_seven QWORD \
+     0000000000000000h,  0000000000000007h
 ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_eight QWORD 0, 8
+L_avx1_aes_gcm_eight QWORD \
+     0000000000000000h,  0000000000000008h
 ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
+L_avx1_aes_gcm_bswap_epi64 QWORD \
+     0001020304050607h,  08090a0b0c0d0e0fh
 ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
+L_avx1_aes_gcm_bswap_mask QWORD \
+     08090a0b0c0d0e0fh,  0001020304050607h
 ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
+L_avx1_aes_gcm_mod2_128 QWORD \
+     0000000000000001h, 0c200000000000000h
 ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_avx1 PROC
         push	r13
         push	rdi
@@ -8328,8 +8355,8 @@ L_AES_GCM_encrypt_avx1_store_tag_done:
         pop	r13
         ret
 AES_GCM_encrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_avx1 PROC
         push	r13
         push	rdi
@@ -9521,8 +9548,8 @@ L_AES_GCM_decrypt_avx1_cmp_tag_done:
         pop	r13
         ret
 AES_GCM_decrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_init_avx1 PROC
         push	rdi
         push	rsi
@@ -9843,8 +9870,8 @@ L_AES_GCM_init_avx1_iv_done:
         pop	rdi
         ret
 AES_GCM_init_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_aad_update_avx1 PROC
         mov	rax, rcx
         sub	rsp, 32
@@ -9909,8 +9936,8 @@ L_AES_GCM_aad_update_avx1_16_loop:
         add	rsp, 32
         ret
 AES_GCM_aad_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_block_avx1 PROC
         mov	r10, r8
         mov	r11, r9
@@ -9949,8 +9976,8 @@ L_AES_GCM_encrypt_block_avx1_aesenc_block_last:
         vzeroupper
         ret
 AES_GCM_encrypt_block_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_ghash_block_avx1 PROC
         sub	rsp, 32
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -10010,8 +10037,8 @@ AES_GCM_ghash_block_avx1 PROC
         add	rsp, 32
         ret
 AES_GCM_ghash_block_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_update_avx1 PROC
         push	r13
         push	r12
@@ -11052,8 +11079,8 @@ L_AES_GCM_encrypt_update_avx1_done_enc:
         pop	r13
         ret
 AES_GCM_encrypt_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_final_avx1 PROC
         push	r13
         push	r12
@@ -11153,8 +11180,8 @@ L_AES_GCM_encrypt_final_avx1_store_tag_done:
         pop	r13
         ret
 AES_GCM_encrypt_final_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_update_avx1 PROC
         push	r13
         push	r12
@@ -11779,8 +11806,8 @@ L_AES_GCM_decrypt_update_avx1_done_dec:
         pop	r13
         ret
 AES_GCM_decrypt_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_final_avx1 PROC
         push	r13
         push	r12
@@ -11901,20 +11928,22 @@ L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
         pop	r13
         ret
 AES_GCM_decrypt_final_avx1 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 _DATA SEGMENT
 ALIGN 16
-L_GCM_generate_m0_avx2_rev8 QWORD 579005069656919567, 283686952306183
+L_GCM_generate_m0_avx2_rev8 QWORD \
+     08090a0b0c0d0e0fh,  0001020304050607h
 ptr_L_GCM_generate_m0_avx2_rev8 QWORD L_GCM_generate_m0_avx2_rev8
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_GCM_generate_m0_avx2_mod2_128 QWORD 0, 16212958658533785600
+L_GCM_generate_m0_avx2_mod2_128 QWORD \
+     0000000000000000h, 0e100000000000000h
 ptr_L_GCM_generate_m0_avx2_mod2_128 QWORD L_GCM_generate_m0_avx2_mod2_128
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 GCM_generate_m0_avx2 PROC
         sub	rsp, 80
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -12122,68 +12151,80 @@ GCM_generate_m0_avx2 PROC
         add	rsp, 80
         ret
 GCM_generate_m0_avx2 ENDP
-_text ENDS
+_TEXT ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_one QWORD 0, 1
+L_avx2_aes_gcm_one QWORD \
+     0000000000000000h,  0000000000000001h
 ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_two QWORD 0, 2
+L_avx2_aes_gcm_two QWORD \
+     0000000000000000h,  0000000000000002h
 ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_three QWORD 0, 3
+L_avx2_aes_gcm_three QWORD \
+     0000000000000000h,  0000000000000003h
 ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_four QWORD 0, 4
+L_avx2_aes_gcm_four QWORD \
+     0000000000000000h,  0000000000000004h
 ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_five QWORD 0, 5
+L_avx2_aes_gcm_five QWORD \
+     0000000000000000h,  0000000000000005h
 ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_six QWORD 0, 6
+L_avx2_aes_gcm_six QWORD \
+     0000000000000000h,  0000000000000006h
 ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_seven QWORD 0, 7
+L_avx2_aes_gcm_seven QWORD \
+     0000000000000000h,  0000000000000007h
 ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_eight QWORD 0, 8
+L_avx2_aes_gcm_eight QWORD \
+     0000000000000000h,  0000000000000008h
 ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_bswap_one QWORD 0, 72057594037927936
+L_avx2_aes_gcm_bswap_one QWORD \
+     0000000000000000h,  0100000000000000h
 ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
+L_avx2_aes_gcm_bswap_epi64 QWORD \
+     0001020304050607h,  08090a0b0c0d0e0fh
 ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
+L_avx2_aes_gcm_bswap_mask QWORD \
+     08090a0b0c0d0e0fh,  0001020304050607h
 ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx2_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
+L_avx2_aes_gcm_mod2_128 QWORD \
+     0000000000000001h, 0c200000000000000h
 ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_avx2 PROC
         push	r13
         push	rdi
@@ -13504,8 +13545,8 @@ L_AES_GCM_encrypt_avx2_store_tag_done:
         pop	r13
         ret
 AES_GCM_encrypt_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_avx2 PROC
         push	r13
         push	rdi
@@ -14489,8 +14530,8 @@ L_AES_GCM_decrypt_avx2_cmp_tag_done:
         pop	r13
         ret
 AES_GCM_decrypt_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_init_avx2 PROC
         push	rbx
         push	rdi
@@ -14763,8 +14804,8 @@ L_AES_GCM_init_avx2_iv_done:
         pop	rbx
         ret
 AES_GCM_init_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_aad_update_avx2 PROC
         mov	rax, rcx
         sub	rsp, 16
@@ -14815,8 +14856,8 @@ L_AES_GCM_aad_update_avx2_16_loop:
         add	rsp, 16
         ret
 AES_GCM_aad_update_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_block_avx2 PROC
         mov	r10, r8
         mov	r11, r9
@@ -14870,8 +14911,8 @@ L_AES_GCM_encrypt_block_avx2_aesenc_block_last:
         add	rsp, 152
         ret
 AES_GCM_encrypt_block_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_ghash_block_avx2 PROC
         sub	rsp, 16
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -14916,8 +14957,8 @@ AES_GCM_ghash_block_avx2 PROC
         add	rsp, 16
         ret
 AES_GCM_ghash_block_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_update_avx2 PROC
         push	r12
         push	r13
@@ -15791,8 +15832,8 @@ L_AES_GCM_encrypt_update_avx2_done_enc:
         pop	r12
         ret
 AES_GCM_encrypt_update_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_encrypt_final_avx2 PROC
         push	r12
         push	r13
@@ -15862,8 +15903,8 @@ L_AES_GCM_encrypt_final_avx2_store_tag_done:
         pop	r12
         ret
 AES_GCM_encrypt_final_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_update_avx2 PROC
         push	r13
         push	r12
@@ -16390,8 +16431,8 @@ L_AES_GCM_decrypt_update_avx2_done_dec:
         pop	r13
         ret
 AES_GCM_decrypt_update_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_GCM_decrypt_final_avx2 PROC
         push	r12
         push	r13
@@ -16475,6 +16516,6 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
         pop	r12
         ret
 AES_GCM_decrypt_final_avx2 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 END
diff --git a/wolfcrypt/src/aes_gcm_x86_asm.S b/wolfcrypt/src/aes_gcm_x86_asm.S
index 152624298d8..d24b350d56a 100644
--- a/wolfcrypt/src/aes_gcm_x86_asm.S
+++ b/wolfcrypt/src/aes_gcm_x86_asm.S
@@ -33,71 +33,93 @@
 #endif /* NO_AVX2_SUPPORT */
 
 .type	data, @object
+.align	16
 L_aes_gcm_one:
-.long	0x0,0x0,0x1,0x0
+.long	0x00000000,0x00000000,0x00000001,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_two:
-.long	0x0,0x0,0x2,0x0
+.long	0x00000000,0x00000000,0x00000002,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_three:
-.long	0x0,0x0,0x3,0x0
+.long	0x00000000,0x00000000,0x00000003,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_four:
-.long	0x0,0x0,0x4,0x0
+.long	0x00000000,0x00000000,0x00000004,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_bswap_epi64:
-.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
+.long	0x04050607,0x00010203,0x0c0d0e0f,0x08090a0b
 .type	data, @object
+.align	16
 L_aes_gcm_bswap_mask:
-.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
+.long	0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203
 .type	data, @object
+.align	16
 L_aes_gcm_mod2_128:
-.long	0x1,0x0,0x0,0xc2000000
+.long	0x00000001,0x00000000,0x00000000,0xc2000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_one:
-.long	0x0,0x0,0x1,0x0
+.long	0x00000000,0x00000000,0x00000001,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_two:
-.long	0x0,0x0,0x2,0x0
+.long	0x00000000,0x00000000,0x00000002,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_three:
-.long	0x0,0x0,0x3,0x0
+.long	0x00000000,0x00000000,0x00000003,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_four:
-.long	0x0,0x0,0x4,0x0
+.long	0x00000000,0x00000000,0x00000004,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_bswap_epi64:
-.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
+.long	0x04050607,0x00010203,0x0c0d0e0f,0x08090a0b
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_bswap_mask:
-.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
+.long	0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203
 .type	data, @object
+.align	16
 L_aes_gcm_avx1_mod2_128:
-.long	0x1,0x0,0x0,0xc2000000
+.long	0x00000001,0x00000000,0x00000000,0xc2000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_one:
-.long	0x0,0x0,0x1,0x0
+.long	0x00000000,0x00000000,0x00000001,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_two:
-.long	0x0,0x0,0x2,0x0
+.long	0x00000000,0x00000000,0x00000002,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_three:
-.long	0x0,0x0,0x3,0x0
+.long	0x00000000,0x00000000,0x00000003,0x00000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_four:
-.long	0x0,0x0,0x4,0x0
+.long	0x00000000,0x00000000,0x00000004,0x00000000
 .type	data, @object
+.align	16
 L_avx2_aes_gcm_bswap_one:
-.long	0x0,0x0,0x0,0x1000000
+.long	0x00000000,0x00000000,0x00000000,0x01000000
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_bswap_epi64:
-.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
+.long	0x04050607,0x00010203,0x0c0d0e0f,0x08090a0b
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_bswap_mask:
-.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
+.long	0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203
 .type	data, @object
+.align	16
 L_aes_gcm_avx2_mod2_128:
-.long	0x1,0x0,0x0,0xc2000000
+.long	0x00000001,0x00000000,0x00000000,0xc2000000
 .text
 .globl	AES_GCM_encrypt_aesni
 .type	AES_GCM_encrypt_aesni,@function
diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S
index ee646203023..09045c6d8f7 100644
--- a/wolfcrypt/src/aes_xts_asm.S
+++ b/wolfcrypt/src/aes_xts_asm.S
@@ -107,6 +107,11 @@ L_AES_XTS_init_aesni_tweak_aes_enc_block_last:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_aes_xts_gc_xts:
 .long	0x00000087,0x00000001,0x00000001,0x00000001
 #ifndef __APPLE__
@@ -1490,6 +1495,11 @@ L_AES_XTS_init_avx1_tweak_aes_enc_block_last:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_avx1_aes_xts_gc_xts:
 .long	0x00000087,0x00000001,0x00000001,0x00000001
 #ifndef __APPLE__
diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm
index c28cb2c9ad2..b0e5cebf316 100644
--- a/wolfcrypt/src/aes_xts_asm.asm
+++ b/wolfcrypt/src/aes_xts_asm.asm
@@ -18,6 +18,7 @@
 ;  * along with this program; if not, write to the Free Software
 ;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 ;  */
+
 IF @Version LT 1200
 ; AVX2 instructions not recognized by old versions of MASM
 IFNDEF NO_AVX2_SUPPORT
@@ -40,7 +41,7 @@ IFNDEF _WIN64
 _WIN64 = 1
 ENDIF
 
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_XTS_init_aesni PROC
         movdqu	xmm0, OWORD PTR [rcx]
         ; aes_enc_block
@@ -81,13 +82,14 @@ L_AES_XTS_init_aesni_tweak_aes_enc_block_last:
         movdqu	OWORD PTR [rcx], xmm0
         ret
 AES_XTS_init_aesni ENDP
-_text ENDS
+_TEXT ENDS
 _DATA SEGMENT
 ALIGN 16
-L_aes_xts_gc_xts DWORD 135,1,1,1
+L_aes_xts_gc_xts DWORD \
+     00000087h,  00000001h,  00000001h,  00000001h
 ptr_L_aes_xts_gc_xts QWORD L_aes_xts_gc_xts
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_XTS_encrypt_aesni PROC
         push	rdi
         push	rsi
@@ -419,8 +421,8 @@ L_AES_XTS_encrypt_aesni_done_enc:
         pop	rdi
         ret
 AES_XTS_encrypt_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_XTS_encrypt_update_aesni PROC
         push	rdi
         push	rsi
@@ -715,8 +717,8 @@ L_AES_XTS_encrypt_update_aesni_done_enc:
         pop	rdi
         ret
 AES_XTS_encrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_XTS_decrypt_aesni PROC
         push	rdi
         push	rsi
@@ -1102,8 +1104,8 @@ L_AES_XTS_decrypt_aesni_done_dec:
         pop	rdi
         ret
 AES_XTS_decrypt_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_XTS_decrypt_update_aesni PROC
         push	rdi
         push	rsi
@@ -1452,9 +1454,9 @@ L_AES_XTS_decrypt_update_aesni_done_dec:
         pop	rdi
         ret
 AES_XTS_decrypt_update_aesni ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX1
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_XTS_init_avx1 PROC
         vmovdqu	xmm0, OWORD PTR [rcx]
         ; aes_enc_block
@@ -1495,13 +1497,14 @@ L_AES_XTS_init_avx1_tweak_aes_enc_block_last:
         vmovdqu	OWORD PTR [rcx], xmm0
         ret
 AES_XTS_init_avx1 ENDP
-_text ENDS
+_TEXT ENDS
 _DATA SEGMENT
 ALIGN 16
-L_avx1_aes_xts_gc_xts DWORD 135,1,1,1
+L_avx1_aes_xts_gc_xts DWORD \
+     00000087h,  00000001h,  00000001h,  00000001h
 ptr_L_avx1_aes_xts_gc_xts QWORD L_avx1_aes_xts_gc_xts
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 AES_XTS_encrypt_avx1 PROC
         push	rdi
         push	rsi
@@ -1824,8 +1827,8 @@ L_AES_XTS_encrypt_avx1_done_enc:
         pop	rdi
         ret
 AES_XTS_encrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_XTS_encrypt_update_avx1 PROC
         push	rdi
         push	rsi
@@ -2111,8 +2114,8 @@ L_AES_XTS_encrypt_update_avx1_done_enc:
         pop	rdi
         ret
 AES_XTS_encrypt_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_XTS_decrypt_avx1 PROC
         push	rdi
         push	rsi
@@ -2487,8 +2490,8 @@ L_AES_XTS_decrypt_avx1_done_dec:
         pop	rdi
         ret
 AES_XTS_decrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 AES_XTS_decrypt_update_avx1 PROC
         push	rdi
         push	rsi
@@ -2826,6 +2829,6 @@ L_AES_XTS_decrypt_update_avx1_done_dec:
         pop	rdi
         ret
 AES_XTS_decrypt_update_avx1 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 END
diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S
index 2e5debb9b13..6109e22f603 100644
--- a/wolfcrypt/src/chacha_asm.S
+++ b/wolfcrypt/src/chacha_asm.S
@@ -489,7 +489,7 @@ L_chacha_x64_done:
 .p2align	4
 #endif /* __APPLE__ */
 L_chacha20_avx1_rotl8:
-.quad	0x605040702010003, 0xe0d0c0f0a09080b
+.quad	0x0605040702010003,0x0e0d0c0f0a09080b
 #ifndef __APPLE__
 .data
 #else
@@ -501,7 +501,7 @@ L_chacha20_avx1_rotl8:
 .p2align	4
 #endif /* __APPLE__ */
 L_chacha20_avx1_rotl16:
-.quad	0x504070601000302, 0xd0c0f0e09080b0a
+.quad	0x0504070601000302,0x0d0c0f0e09080b0a
 #ifndef __APPLE__
 .data
 #else
@@ -513,7 +513,7 @@ L_chacha20_avx1_rotl16:
 .p2align	4
 #endif /* __APPLE__ */
 L_chacha20_avx1_add:
-.quad	0x100000000, 0x300000002
+.quad	0x0000000100000000,0x0000000300000002
 #ifndef __APPLE__
 .data
 #else
@@ -525,7 +525,7 @@ L_chacha20_avx1_add:
 .p2align	4
 #endif /* __APPLE__ */
 L_chacha20_avx1_four:
-.quad	0x400000004, 0x400000004
+.quad	0x0000000400000004,0x0000000400000004
 #ifndef __APPLE__
 .text
 .globl	chacha_encrypt_avx1
@@ -1057,8 +1057,8 @@ L_chacha20_avx1_partial_done:
 .p2align	5
 #endif /* __APPLE__ */
 L_chacha20_avx2_rotl8:
-.quad	0x605040702010003, 0xe0d0c0f0a09080b
-.quad	0x605040702010003, 0xe0d0c0f0a09080b
+.quad	0x0605040702010003,0x0e0d0c0f0a09080b
+.quad	0x0605040702010003,0x0e0d0c0f0a09080b
 #ifndef __APPLE__
 .data
 #else
@@ -1070,8 +1070,8 @@ L_chacha20_avx2_rotl8:
 .p2align	5
 #endif /* __APPLE__ */
 L_chacha20_avx2_rotl16:
-.quad	0x504070601000302, 0xd0c0f0e09080b0a
-.quad	0x504070601000302, 0xd0c0f0e09080b0a
+.quad	0x0504070601000302,0x0d0c0f0e09080b0a
+.quad	0x0504070601000302,0x0d0c0f0e09080b0a
 #ifndef __APPLE__
 .data
 #else
@@ -1083,8 +1083,8 @@ L_chacha20_avx2_rotl16:
 .p2align	5
 #endif /* __APPLE__ */
 L_chacha20_avx2_add:
-.quad	0x100000000, 0x300000002
-.quad	0x500000004, 0x700000006
+.quad	0x0000000100000000,0x0000000300000002
+.quad	0x0000000500000004,0x0000000700000006
 #ifndef __APPLE__
 .data
 #else
@@ -1096,8 +1096,8 @@ L_chacha20_avx2_add:
 .p2align	5
 #endif /* __APPLE__ */
 L_chacha20_avx2_eight:
-.quad	0x800000008, 0x800000008
-.quad	0x800000008, 0x800000008
+.quad	0x0000000800000008,0x0000000800000008
+.quad	0x0000000800000008,0x0000000800000008
 #ifndef __APPLE__
 .text
 .globl	chacha_encrypt_avx2
diff --git a/wolfcrypt/src/chacha_asm.asm b/wolfcrypt/src/chacha_asm.asm
index e663709e8d1..b9444254c90 100644
--- a/wolfcrypt/src/chacha_asm.asm
+++ b/wolfcrypt/src/chacha_asm.asm
@@ -18,6 +18,7 @@
 ;  * along with this program; if not, write to the Free Software
 ;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 ;  */
+
 IF @Version LT 1200
 ; AVX2 instructions not recognized by old versions of MASM
 IFNDEF NO_AVX2_SUPPORT
@@ -40,7 +41,7 @@ IFNDEF _WIN64
 _WIN64 = 1
 ENDIF
 
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 chacha_encrypt_x64 PROC
         push	rbx
         push	rbp
@@ -457,29 +458,33 @@ L_chacha_x64_done:
         pop	rbx
         ret
 chacha_encrypt_x64 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX1
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx1_rotl8 QWORD 433757367256023043, 1012478749960636427
+L_chacha20_avx1_rotl8 QWORD \
+     0605040702010003h,  0e0d0c0f0a09080bh
 ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx1_rotl16 QWORD 361421592464458498, 940142975169071882
+L_chacha20_avx1_rotl16 QWORD \
+     0504070601000302h,  0d0c0f0e09080b0ah
 ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx1_add QWORD 4294967296, 12884901890
+L_chacha20_avx1_add QWORD \
+     0000000100000000h,  0000000300000002h
 ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx1_four QWORD 17179869188, 17179869188
+L_chacha20_avx1_four QWORD \
+     0000000400000004h,  0000000400000004h
 ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 chacha_encrypt_avx1 PROC
         push	r12
         push	r13
@@ -1009,34 +1014,38 @@ L_chacha20_avx1_partial_done:
         pop	r12
         ret
 chacha_encrypt_avx1 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx2_rotl8 QWORD 433757367256023043, 1012478749960636427,
-    433757367256023043, 1012478749960636427
+L_chacha20_avx2_rotl8 QWORD \
+     0605040702010003h,  0e0d0c0f0a09080bh,
+     0605040702010003h,  0e0d0c0f0a09080bh
 ptr_L_chacha20_avx2_rotl8 QWORD L_chacha20_avx2_rotl8
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx2_rotl16 QWORD 361421592464458498, 940142975169071882,
-    361421592464458498, 940142975169071882
+L_chacha20_avx2_rotl16 QWORD \
+     0504070601000302h,  0d0c0f0e09080b0ah,
+     0504070601000302h,  0d0c0f0e09080b0ah
 ptr_L_chacha20_avx2_rotl16 QWORD L_chacha20_avx2_rotl16
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx2_add QWORD 4294967296, 12884901890,
-    21474836484, 30064771078
+L_chacha20_avx2_add QWORD \
+     0000000100000000h,  0000000300000002h,
+     0000000500000004h,  0000000700000006h
 ptr_L_chacha20_avx2_add QWORD L_chacha20_avx2_add
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_chacha20_avx2_eight QWORD 34359738376, 34359738376,
-    34359738376, 34359738376
+L_chacha20_avx2_eight QWORD \
+     0000000800000008h,  0000000800000008h,
+     0000000800000008h,  0000000800000008h
 ptr_L_chacha20_avx2_eight QWORD L_chacha20_avx2_eight
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 chacha_encrypt_avx2 PROC
         push	r12
         push	r13
@@ -1420,6 +1429,6 @@ L_chacha20_avx2_end256:
         pop	r12
         ret
 chacha_encrypt_avx2 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 END
diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S
index f4cdf343c0a..ed53856bca8 100644
--- a/wolfcrypt/src/fe_x25519_asm.S
+++ b/wolfcrypt/src/fe_x25519_asm.S
@@ -2343,8 +2343,8 @@ _fe_invert_x64:
 .p2align	5
 #endif /* __APPLE__ */
 L_curve25519_base_x64_x2:
-.quad	0x5cae469cdd684efb, 0x8f3f5ced1e350b5c
-.quad	0xd9750c687d157114, 0x20d342d51873f1b7
+.quad	0x5cae469cdd684efb,0x8f3f5ced1e350b5c
+.quad	0xd9750c687d157114,0x20d342d51873f1b7
 #ifndef __APPLE__
 .text
 .globl	curve25519_base_x64
@@ -8852,7 +8852,7 @@ _ge_p2_dbl_x64:
         shldq	$0x01, %r11, %r12
         shldq	$0x01, %r10, %r11
         shldq	$0x01, %r9, %r10
-        shlq	$0x01, %r9
+        shlq	$1, %r9
         movq	$0x7fffffffffffffff, %r8
         shrq	$62, %rax
         andq	%r8, %r12
@@ -11629,7 +11629,7 @@ _fe_sq2_x64:
         shldq	$0x01, %r9, %r10
         shldq	$0x01, %r8, %r9
         shldq	$0x01, %rcx, %r8
-        shlq	$0x01, %rcx
+        shlq	$1, %rcx
         movq	$0x7fffffffffffffff, %r15
         shrq	$62, %rax
         andq	%r15, %r10
@@ -12157,10 +12157,10 @@ _fe_invert_nct_x64:
         testb	$0x01, %r11b
         jnz	fe_invert_nct_v_even_end
 fe_invert_nct_v_even_start:
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         movb	$0x01, (%rsp,%r15,1)
         incq	%r15
         testb	$0x01, %r11b
@@ -12185,17 +12185,17 @@ L_fe_invert_nct_uv_u:
         sbbq	%r12, %r8
         sbbq	%r13, %r9
         sbbq	%r14, %r10
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrq	$0x01, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrq	$1, %r10
         testb	$0x01, %cl
         jnz	fe_invert_nct_usubv_even_end
 fe_invert_nct_usubv_even_start:
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrq	$0x01, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrq	$1, %r10
         movb	$0x00, (%rsp,%r15,1)
         incq	%r15
         testb	$0x01, %cl
@@ -12217,17 +12217,17 @@ L_fe_invert_nct_uv_v:
         sbbq	%r8, %r12
         sbbq	%r9, %r13
         sbbq	%r10, %r14
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         testb	$0x01, %r11b
         jnz	fe_invert_nct_vsubu_even_end
 fe_invert_nct_vsubu_even_start:
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         movb	$0x01, (%rsp,%r15,1)
         incq	%r15
         testb	$0x01, %r11b
@@ -12282,10 +12282,10 @@ L_fe_invert_nct_op_div2_b:
         movq	$0x7fffffffffffffff, %rdx
         adcq	%rdx, %r10
 L_fe_invert_nct_op_div2_b_mod:
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrq	$0x01, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrq	$1, %r10
         movb	(%rsp,%r15,1), %dl
         incq	%r15
         cmpb	$0x01, %dl
@@ -12317,10 +12317,10 @@ L_fe_invert_nct_op_div2_d:
         movq	$0x7fffffffffffffff, %rdx
         adcq	%rdx, %r14
 L_fe_invert_nct_op_div2_d_mod:
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         movb	(%rsp,%r15,1), %dl
         incq	%r15
         cmpb	$0x01, %dl
@@ -13201,8 +13201,8 @@ _fe_invert_avx2:
 .p2align	5
 #endif /* __APPLE__ */
 L_curve25519_base_avx2_x2:
-.quad	0x5cae469cdd684efb, 0x8f3f5ced1e350b5c
-.quad	0xd9750c687d157114, 0x20d342d51873f1b7
+.quad	0x5cae469cdd684efb,0x8f3f5ced1e350b5c
+.quad	0xd9750c687d157114,0x20d342d51873f1b7
 #ifndef __APPLE__
 .text
 .globl	curve25519_base_avx2
@@ -18609,7 +18609,7 @@ _ge_p2_dbl_avx2:
         shldq	$0x01, %r12, %r13
         shldq	$0x01, %r11, %r12
         shldq	$0x01, %r10, %r11
-        shlq	$0x01, %r10
+        shlq	$1, %r10
         movq	$0x7fffffffffffffff, %rcx
         shrq	$62, %r9
         andq	%rcx, %r13
@@ -20979,7 +20979,7 @@ _fe_sq2_avx2:
         shldq	$0x01, %r10, %r11
         shldq	$0x01, %r9, %r10
         shldq	$0x01, %r8, %r9
-        shlq	$0x01, %r8
+        shlq	$1, %r8
         movq	$0x7fffffffffffffff, %rcx
         shrq	$62, %rax
         andq	%rcx, %r11
@@ -21415,7 +21415,12 @@ _sc_muladd_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__prime:
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_prime:
 .long	0x03ffffed,0x03ffffff,0x03ffffff,0x03ffffff
 .long	0x03ffffff,0x00000000,0x00000000,0x00000000
 .long	0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff
@@ -21430,15 +21435,20 @@ L_sp_mod_inv_avx2__prime:
 #else
 .p2align	5
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__one:
-.quad	0x1, 0x0
-.quad	0x0, 0x0
+L_fe_invert_nct_avx2_one:
+.quad	0x0000000000000001,0x0000000000000000
+.quad	0x0000000000000000,0x0000000000000000
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__all_one:
+#ifndef __APPLE__
+.align	16 /* 16-byte boundary for the 32-byte table below */
+#else
+.p2align	4 /* same boundary, written as a power of two: 2^4 = 16 */
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_all_one: /* 8 dwords of 1; fe_invert_nct_avx2 loads it into ymm10 with vmovupd */
 .long	0x00000001,0x00000001,0x00000001,0x00000001
 .long	0x00000001,0x00000001,0x00000001,0x00000001
 #ifndef __APPLE__
@@ -21446,7 +21456,12 @@ L_sp_mod_inv_avx2__all_one:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__mask01111:
+#ifndef __APPLE__
+.align	16 /* 16-byte boundary for the 32-byte table below */
+#else
+.p2align	4 /* same boundary, written as a power of two: 2^4 = 16 */
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_mask01111: /* bit 0 set in dwords 1..4 only; loaded into ymm9 and applied with vpand */
 .long	0x00000000,0x00000001,0x00000001,0x00000001
 .long	0x00000001,0x00000000,0x00000000,0x00000000
 #ifndef __APPLE__
@@ -21454,7 +21469,12 @@ L_sp_mod_inv_avx2__mask01111:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__down_one_dword:
+#ifndef __APPLE__
+.align	16 /* 16-byte boundary for the 32-byte table below */
+#else
+.p2align	4 /* same boundary, written as a power of two: 2^4 = 16 */
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_down_one_dword: /* dword indices; loaded into ymm11 and used as the vpermd index vector */
 .long	0x00000001,0x00000002,0x00000003,0x00000004
 .long	0x00000005,0x00000006,0x00000007,0x00000007
 #ifndef __APPLE__
@@ -21462,7 +21482,12 @@ L_sp_mod_inv_avx2__down_one_dword:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__neg:
+#ifndef __APPLE__
+.align	16 /* 16-byte boundary for the 32-byte table below */
+#else
+.p2align	4 /* same boundary, written as a power of two: 2^4 = 16 */
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_neg: /* only bit 31 of dword 4 is set; loaded into ymm12 and checked with vptest */
 .long	0x00000000,0x00000000,0x00000000,0x00000000
 .long	0x80000000,0x00000000,0x00000000,0x00000000
 #ifndef __APPLE__
@@ -21470,7 +21495,12 @@ L_sp_mod_inv_avx2__neg:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__up_one_dword:
+#ifndef __APPLE__
+.align	16 /* 16-byte boundary for the 32-byte table below */
+#else
+.p2align	4 /* same boundary, written as a power of two: 2^4 = 16 */
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_up_one_dword: /* dword indices; fe_invert_nct_avx2 loads this table into ymm13 */
 .long	0x00000007,0x00000000,0x00000001,0x00000002
 .long	0x00000003,0x00000007,0x00000007,0x00000007
 #ifndef __APPLE__
@@ -21478,7 +21508,12 @@ L_sp_mod_inv_avx2__up_one_dword:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
-L_sp_mod_inv_avx2__mask26:
+#ifndef __APPLE__
+.align	16 /* 16-byte boundary for the 32-byte table below */
+#else
+.p2align	4 /* same boundary, written as a power of two: 2^4 = 16 */
+#endif /* __APPLE__ */
+L_fe_invert_nct_avx2_mask26: /* low 26 bits set in dwords 0..4, zero elsewhere; loaded into ymm14 */
 .long	0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff
 .long	0x03ffffff,0x00000000,0x00000000,0x00000000
 /* Non-constant time modular inversion.
@@ -21513,39 +21548,39 @@ _fe_invert_nct_avx2:
         movq	8(%rsi), %r11
         movq	16(%rsi), %r12
         movq	24(%rsi), %r13
-        leaq	L_sp_mod_inv_avx2__prime(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_prime(%rip), %rbx
         vmovupd	(%rbx), %ymm6
         vmovupd	32(%rbx), %ymm7
-        leaq	L_sp_mod_inv_avx2__one(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_one(%rip), %rbx
         vmovupd	(%rbx), %ymm8
-        leaq	L_sp_mod_inv_avx2__mask01111(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_mask01111(%rip), %rbx
         vmovupd	(%rbx), %ymm9
-        leaq	L_sp_mod_inv_avx2__all_one(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_all_one(%rip), %rbx
         vmovupd	(%rbx), %ymm10
-        leaq	L_sp_mod_inv_avx2__down_one_dword(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_down_one_dword(%rip), %rbx
         vmovupd	(%rbx), %ymm11
-        leaq	L_sp_mod_inv_avx2__neg(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_neg(%rip), %rbx
         vmovupd	(%rbx), %ymm12
-        leaq	L_sp_mod_inv_avx2__up_one_dword(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_up_one_dword(%rip), %rbx
         vmovupd	(%rbx), %ymm13
-        leaq	L_sp_mod_inv_avx2__mask26(%rip), %rbx
+        leaq	L_fe_invert_nct_avx2_mask26(%rip), %rbx
         vmovupd	(%rbx), %ymm14
         vpxor	%xmm0, %xmm0, %xmm0
         vpxor	%xmm1, %xmm1, %xmm1
         vmovdqu	%ymm8, %ymm2
         vpxor	%xmm3, %xmm3, %xmm3
         testb	$0x01, %r10b
-        jnz	L__mod_inv_avx2__v_even_end
-L__mod_inv_avx2__v_even_start:
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrq	$0x01, %r13
+        jnz	L_fe_invert_nct_avx2_v_even_end
+L_fe_invert_nct_avx2_v_even_start:
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrq	$1, %r13
         vptest	%ymm8, %ymm2
-        jz	L__mod_inv_avx2__v_even_shr1
+        jz	L_fe_invert_nct_avx2_v_even_shr1
         vpaddd	%ymm6, %ymm2, %ymm2
         vpaddd	%ymm7, %ymm3, %ymm3
-L__mod_inv_avx2__v_even_shr1:
+L_fe_invert_nct_avx2_v_even_shr1:
         vpand	%ymm9, %ymm2, %ymm4
         vpand	%ymm10, %ymm3, %ymm5
         vpermd	%ymm4, %ymm11, %ymm4
@@ -21556,21 +21591,21 @@ L__mod_inv_avx2__v_even_shr1:
         vpaddd	%ymm5, %ymm2, %ymm2
         vpaddd	%ymm4, %ymm3, %ymm3
         testb	$0x01, %r10b
-        jz	L__mod_inv_avx2__v_even_start
-L__mod_inv_avx2__v_even_end:
-L__mod_inv_avx2__uv_start:
+        jz	L_fe_invert_nct_avx2_v_even_start
+L_fe_invert_nct_avx2_v_even_end:
+L_fe_invert_nct_avx2_uv_start:
         cmpq	%r13, %r9
-        jb	L__mod_inv_avx2__uv_v
-        ja	L__mod_inv_avx2__uv_u
+        jb	L_fe_invert_nct_avx2_uv_v
+        ja	L_fe_invert_nct_avx2_uv_u
         cmpq	%r12, %r8
-        jb	L__mod_inv_avx2__uv_v
-        ja	L__mod_inv_avx2__uv_u
+        jb	L_fe_invert_nct_avx2_uv_v
+        ja	L_fe_invert_nct_avx2_uv_u
         cmpq	%r11, %rcx
-        jb	L__mod_inv_avx2__uv_v
-        ja	L__mod_inv_avx2__uv_u
+        jb	L_fe_invert_nct_avx2_uv_v
+        ja	L_fe_invert_nct_avx2_uv_u
         cmpq	%r10, %rax
-        jb	L__mod_inv_avx2__uv_v
-L__mod_inv_avx2__uv_u:
+        jb	L_fe_invert_nct_avx2_uv_v
+L_fe_invert_nct_avx2_uv_u:
         subq	%r10, %rax
         sbbq	%r11, %rcx
         vpsubd	%ymm2, %ymm0, %ymm0
@@ -21578,20 +21613,20 @@ L__mod_inv_avx2__uv_u:
         vpsubd	%ymm3, %ymm1, %ymm1
         sbbq	%r13, %r9
         vptest	%ymm12, %ymm1
-        jz	L__mod_inv_avx2__usubv_done_neg
+        jz	L_fe_invert_nct_avx2_usubv_done_neg
         vpaddd	%ymm6, %ymm0, %ymm0
         vpaddd	%ymm7, %ymm1, %ymm1
-L__mod_inv_avx2__usubv_done_neg:
-L__mod_inv_avx2__usubv_shr1:
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrq	$0x01, %r9
+L_fe_invert_nct_avx2_usubv_done_neg:
+L_fe_invert_nct_avx2_usubv_shr1:
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrq	$1, %r9
         vptest	%ymm8, %ymm0
-        jz	L__mod_inv_avx2__usubv_sub_shr1
+        jz	L_fe_invert_nct_avx2_usubv_sub_shr1
         vpaddd	%ymm6, %ymm0, %ymm0
         vpaddd	%ymm7, %ymm1, %ymm1
-L__mod_inv_avx2__usubv_sub_shr1:
+L_fe_invert_nct_avx2_usubv_sub_shr1:
         vpand	%ymm9, %ymm0, %ymm4
         vpand	%ymm10, %ymm1, %ymm5
         vpermd	%ymm4, %ymm11, %ymm4
@@ -21602,14 +21637,14 @@ L__mod_inv_avx2__usubv_sub_shr1:
         vpaddd	%ymm5, %ymm0, %ymm0
         vpaddd	%ymm4, %ymm1, %ymm1
         testb	$0x01, %al
-        jz	L__mod_inv_avx2__usubv_shr1
+        jz	L_fe_invert_nct_avx2_usubv_shr1
         cmpq	$0x01, %rax
-        jne	L__mod_inv_avx2__uv_start
+        jne	L_fe_invert_nct_avx2_uv_start
         movq	%rcx, %rdx
         orq	%r8, %rdx
-        jne	L__mod_inv_avx2__uv_start
+        jne	L_fe_invert_nct_avx2_uv_start
         orq	%r9, %rdx
-        jne	L__mod_inv_avx2__uv_start
+        jne	L_fe_invert_nct_avx2_uv_start
         vpextrd	$0x00, %xmm0, %eax
         vpextrd	$0x01, %xmm0, %r8d
         vpextrd	$2, %xmm0, %r10d
@@ -21622,8 +21657,8 @@ L__mod_inv_avx2__usubv_sub_shr1:
         vextracti128	$0x01, %ymm1, %xmm1
         vpextrd	$0x00, %xmm0, %r14d
         vpextrd	$0x00, %xmm1, %r15d
-        jmp	L__mod_inv_avx2__store_done
-L__mod_inv_avx2__uv_v:
+        jmp	L_fe_invert_nct_avx2_store_done
+L_fe_invert_nct_avx2_uv_v:
         subq	%rax, %r10
         sbbq	%rcx, %r11
         vpsubd	%ymm0, %ymm2, %ymm2
@@ -21631,20 +21666,20 @@ L__mod_inv_avx2__uv_v:
         vpsubd	%ymm1, %ymm3, %ymm3
         sbbq	%r9, %r13
         vptest	%ymm12, %ymm3
-        jz	L__mod_inv_avx2__vsubu_done_neg
+        jz	L_fe_invert_nct_avx2_vsubu_done_neg
         vpaddd	%ymm6, %ymm2, %ymm2
         vpaddd	%ymm7, %ymm3, %ymm3
-L__mod_inv_avx2__vsubu_done_neg:
-L__mod_inv_avx2__vsubu_shr1:
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrq	$0x01, %r13
+L_fe_invert_nct_avx2_vsubu_done_neg:
+L_fe_invert_nct_avx2_vsubu_shr1:
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrq	$1, %r13
         vptest	%ymm8, %ymm2
-        jz	L__mod_inv_avx2__vsubu_sub_shr1
+        jz	L_fe_invert_nct_avx2_vsubu_sub_shr1
         vpaddd	%ymm6, %ymm2, %ymm2
         vpaddd	%ymm7, %ymm3, %ymm3
-L__mod_inv_avx2__vsubu_sub_shr1:
+L_fe_invert_nct_avx2_vsubu_sub_shr1:
         vpand	%ymm9, %ymm2, %ymm4
         vpand	%ymm10, %ymm3, %ymm5
         vpermd	%ymm4, %ymm11, %ymm4
@@ -21655,14 +21690,14 @@ L__mod_inv_avx2__vsubu_sub_shr1:
         vpaddd	%ymm5, %ymm2, %ymm2
         vpaddd	%ymm4, %ymm3, %ymm3
         testb	$0x01, %r10b
-        jz	L__mod_inv_avx2__vsubu_shr1
+        jz	L_fe_invert_nct_avx2_vsubu_shr1
         cmpq	$0x01, %r10
-        jne	L__mod_inv_avx2__uv_start
+        jne	L_fe_invert_nct_avx2_uv_start
         movq	%r11, %rdx
         orq	%r12, %rdx
-        jne	L__mod_inv_avx2__uv_start
+        jne	L_fe_invert_nct_avx2_uv_start
         orq	%r13, %rdx
-        jne	L__mod_inv_avx2__uv_start
+        jne	L_fe_invert_nct_avx2_uv_start
         vpextrd	$0x00, %xmm2, %eax
         vpextrd	$0x01, %xmm2, %r8d
         vpextrd	$2, %xmm2, %r10d
@@ -21675,7 +21710,7 @@ L__mod_inv_avx2__vsubu_sub_shr1:
         vextracti128	$0x01, %ymm3, %xmm3
         vpextrd	$0x00, %xmm2, %r14d
         vpextrd	$0x00, %xmm3, %r15d
-L__mod_inv_avx2__store_done:
+L_fe_invert_nct_avx2_store_done:
         movl	%eax, %edx
         andl	$0x3ffffff, %eax
         sarl	$26, %edx
@@ -21732,7 +21767,7 @@ L__mod_inv_avx2__store_done:
         adcq	%r13, %r12
         movslq	%r14d, %r14
         adcq	%r15, %r14
-        jge	L__mod_inv_avx2__3_no_add_prime
+        jge	L_fe_invert_nct_avx2_uv_start_no_add_prime
         movq	$0xfffffffffffed, %rcx
         movq	$0xfffffffffffff, %r9
         movq	$0xfffffffffffff, %r11
@@ -21760,7 +21795,7 @@ L__mod_inv_avx2__store_done:
         andq	%rdx, %r12
         sarq	$52, %r13
         addq	%r13, %r14
-L__mod_inv_avx2__3_no_add_prime:
+L_fe_invert_nct_avx2_uv_start_no_add_prime:
         movq	%r8, %rcx
         movq	%r10, %r9
         movq	%r12, %r11
diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S
index a1e6f68dbe2..7f73e87b67e 100644
--- a/wolfcrypt/src/poly1305_asm.S
+++ b/wolfcrypt/src/poly1305_asm.S
@@ -672,8 +672,8 @@ _poly1305_setkey_avx2:
 .p2align	5
 #endif /* __APPLE__ */
 L_poly1305_avx2_blocks_mask:
-.quad	0x3ffffff, 0x3ffffff
-.quad	0x3ffffff, 0x3ffffff
+.quad	0x0000000003ffffff,0x0000000003ffffff
+.quad	0x0000000003ffffff,0x0000000003ffffff
 #ifndef __APPLE__
 .data
 #else
@@ -685,8 +685,8 @@ L_poly1305_avx2_blocks_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_poly1305_avx2_blocks_hibit:
-.quad	0x1000000, 0x1000000
-.quad	0x1000000, 0x1000000
+.quad	0x0000000001000000,0x0000000001000000
+.quad	0x0000000001000000,0x0000000001000000
 #ifndef __APPLE__
 .text
 .globl	poly1305_blocks_avx2
diff --git a/wolfcrypt/src/poly1305_asm.asm b/wolfcrypt/src/poly1305_asm.asm
index ecabf55b96b..de7e5259ae5 100644
--- a/wolfcrypt/src/poly1305_asm.asm
+++ b/wolfcrypt/src/poly1305_asm.asm
@@ -18,6 +18,7 @@
 ;  * along with this program; if not, write to the Free Software
 ;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 ;  */
+
 IF @Version LT 1200
 ; AVX2 instructions not recognized by old versions of MASM
 IFNDEF NO_AVX2_SUPPORT
@@ -41,7 +42,7 @@ _WIN64 = 1
 ENDIF
 
 IFDEF HAVE_INTEL_AVX1
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 poly1305_setkey_avx PROC
         push	r12
         push	r13
@@ -93,8 +94,8 @@ poly1305_setkey_avx PROC
         pop	r12
         ret
 poly1305_setkey_avx ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 poly1305_block_avx PROC
         push	r15
         push	rbx
@@ -166,8 +167,8 @@ poly1305_block_avx PROC
         pop	r15
         ret
 poly1305_block_avx ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 poly1305_blocks_avx PROC
         push	rdi
         push	rsi
@@ -249,8 +250,8 @@ L_poly1305_avx_blocks_start:
         pop	rdi
         ret
 poly1305_blocks_avx ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 poly1305_final_avx PROC
         push	rdi
         push	rbx
@@ -318,10 +319,10 @@ L_poly1305_avx_final_no_more:
         pop	rdi
         ret
 poly1305_final_avx ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 poly1305_calc_powers_avx2 PROC
         push	r12
         push	r13
@@ -581,8 +582,8 @@ poly1305_calc_powers_avx2 PROC
         pop	r12
         ret
 poly1305_calc_powers_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 poly1305_setkey_avx2 PROC
         call	poly1305_setkey_avx
         vpxor	ymm0, ymm0, ymm0
@@ -595,20 +596,22 @@ poly1305_setkey_avx2 PROC
         mov	WORD PTR [rcx+616], 0
         ret
 poly1305_setkey_avx2 ENDP
-_text ENDS
+_TEXT ENDS
 _DATA SEGMENT
 ALIGN 16
-L_poly1305_avx2_blocks_mask QWORD 67108863, 67108863,
-    67108863, 67108863
+L_poly1305_avx2_blocks_mask QWORD \
+     0000000003ffffffh,  0000000003ffffffh,
+     0000000003ffffffh,  0000000003ffffffh
 ptr_L_poly1305_avx2_blocks_mask QWORD L_poly1305_avx2_blocks_mask
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_poly1305_avx2_blocks_hibit QWORD 16777216, 16777216,
-    16777216, 16777216
+L_poly1305_avx2_blocks_hibit QWORD \
+     0000000001000000h,  0000000001000000h,
+     0000000001000000h,  0000000001000000h
 ptr_L_poly1305_avx2_blocks_hibit QWORD L_poly1305_avx2_blocks_hibit
 _DATA ENDS
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 poly1305_blocks_avx2 PROC
         push	r12
         push	rdi
@@ -990,8 +993,8 @@ L_poly1305_avx2_blocks_complete:
         pop	r12
         ret
 poly1305_blocks_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 poly1305_final_avx2 PROC
         push	rdi
         push	rsi
@@ -1055,6 +1058,6 @@ L_poly1305_avx2_final_cmp_copy:
         pop	rdi
         ret
 poly1305_final_avx2 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 END
diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
index 88882b3a486..2112845ce06 100644
--- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S
@@ -968,7 +968,7 @@ L_aes_set_key_arm32_crypto_done:
 	.globl	AES_encrypt_AARCH32
 	.type	AES_encrypt_AARCH32, %function
 AES_encrypt_AARCH32:
-	vpush	{d8, d9}
+	vpush	{d8-d9}
 	vld1.8	{q0}, [r0]
 	vldm	r2!, {q1-q4}
 	aese.8	q0, q1
@@ -989,19 +989,19 @@ AES_encrypt_AARCH32:
 	aese.8	q0, q4
 	aesmc.8	q0, q0
 	subs	r3, r3, #10
-	vld1.32	{q1, q2}, [r2]!
+	vld1.32	{q1-q2}, [r2]!
 	aese.8	q0, q1
 	aesmc.8	q0, q0
 	aese.8	q0, q2
 	beq	L_aes_encrypt_arm32_crypto_round_done
-	vld1.32	{q1, q2}, [r2]!
+	vld1.32	{q1-q2}, [r2]!
 	subs	r3, r3, #2
 	aesmc.8	q0, q0
 	aese.8	q0, q1
 	aesmc.8	q0, q0
 	aese.8	q0, q2
 	beq	L_aes_encrypt_arm32_crypto_round_done
-	vld1.32	{q1, q2}, [r2]!
+	vld1.32	{q1-q2}, [r2]!
 	aesmc.8	q0, q0
 	aese.8	q0, q1
 	aesmc.8	q0, q0
@@ -1010,7 +1010,7 @@ L_aes_encrypt_arm32_crypto_round_done:
 	vld1.32	{q1}, [r2]
 	veor.32	q0, q0, q1
 	vst1.8	{q0}, [r1]
-	vpop	{d8, d9}
+	vpop	{d8-d9}
 	bx	lr
 	.size	AES_encrypt_AARCH32,.-AES_encrypt_AARCH32
 #endif /* defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) */
@@ -1021,7 +1021,7 @@ L_aes_encrypt_arm32_crypto_round_done:
 	.globl	AES_decrypt_AARCH32
 	.type	AES_decrypt_AARCH32, %function
 AES_decrypt_AARCH32:
-	vpush	{d8, d9}
+	vpush	{d8-d9}
 	vld1.8	{q0}, [r0]
 	vldm	r2!, {q1-q4}
 	aesd.8	q0, q1
@@ -1041,20 +1041,20 @@ AES_decrypt_AARCH32:
 	aesimc.8	q0, q0
 	aesd.8	q0, q4
 	aesimc.8	q0, q0
-	vld1.32	{q1, q2}, [r2]!
+	vld1.32	{q1-q2}, [r2]!
 	aesd.8	q0, q1
 	aesimc.8	q0, q0
 	aesd.8	q0, q2
 	subs	r3, r3, #10
 	beq	L_aes_decrypt_arm32_crypto_round_done
-	vld1.32	{q1, q2}, [r2]!
+	vld1.32	{q1-q2}, [r2]!
 	aesimc.8	q0, q0
 	aesd.8	q0, q1
 	aesimc.8	q0, q0
 	aesd.8	q0, q2
 	subs	r3, r3, #2
 	beq	L_aes_decrypt_arm32_crypto_round_done
-	vld1.32	{q1, q2}, [r2]!
+	vld1.32	{q1-q2}, [r2]!
 	aesimc.8	q0, q0
 	aesd.8	q0, q1
 	aesimc.8	q0, q0
@@ -1063,7 +1063,7 @@ L_aes_decrypt_arm32_crypto_round_done:
 	vld1.32	{q1}, [r2]
 	veor.32	q0, q0, q1
 	vst1.8	{q0}, [r1]
-	vpop	{d8, d9}
+	vpop	{d8-d9}
 	bx	lr
 	.size	AES_decrypt_AARCH32,.-AES_decrypt_AARCH32
 #endif /* HAVE_AES_DECRYPT */
@@ -1083,7 +1083,7 @@ AES_encrypt_blocks_AARCH32:
 	bgt	L_aes_encrypt_blocks_arm32_crypto_start_256
 	# AES_ECB_192
 #ifndef NO_AES_192
-	vld1.32	{q8, q9}, [r3]!
+	vld1.32	{q8-q9}, [r3]!
 	cmp	r2, #1
 	beq	L_aes_encrypt_blocks_arm32_crypto_192_start_1
 L_aes_encrypt_blocks_arm32_crypto_192_start_4:
@@ -1197,7 +1197,7 @@ L_aes_encrypt_blocks_arm32_crypto_192_start_4:
 L_aes_encrypt_blocks_arm32_crypto_192_start_2:
 	cmp	r2, #2
 	blt	L_aes_encrypt_blocks_arm32_crypto_192_start_1
-	vld1.8	{q12, q13}, [r0]!
+	vld1.8	{q12-q13}, [r0]!
 	aese.8	q12, q0
 	aesmc.8	q12, q12
 	aese.8	q13, q0
@@ -1251,7 +1251,7 @@ L_aes_encrypt_blocks_arm32_crypto_192_start_2:
 	veor.32	q13, q13, q10
 	sub	r3, r3, #48
 	sub	r2, r2, #2
-	vst1.8	{q12, q13}, [r1]!
+	vst1.8	{q12-q13}, [r1]!
 L_aes_encrypt_blocks_arm32_crypto_192_start_1:
 	cmp	r2, #0
 	beq	L_aes_encrypt_blocks_arm32_crypto_192_done
@@ -1291,7 +1291,7 @@ L_aes_encrypt_blocks_arm32_crypto_192_done:
 	# AES_ECB_256
 L_aes_encrypt_blocks_arm32_crypto_start_256:
 #ifndef NO_AES_256
-	vld1.32	{q8, q9}, [r3]!
+	vld1.32	{q8-q9}, [r3]!
 	cmp	r2, #1
 	beq	L_aes_encrypt_blocks_arm32_crypto_256_start_1
 L_aes_encrypt_blocks_arm32_crypto_256_start_4:
@@ -1423,7 +1423,7 @@ L_aes_encrypt_blocks_arm32_crypto_256_start_4:
 L_aes_encrypt_blocks_arm32_crypto_256_start_2:
 	cmp	r2, #2
 	blt	L_aes_encrypt_blocks_arm32_crypto_256_start_1
-	vld1.8	{q12, q13}, [r0]!
+	vld1.8	{q12-q13}, [r0]!
 	aese.8	q12, q0
 	aesmc.8	q12, q12
 	aese.8	q13, q0
@@ -1487,7 +1487,7 @@ L_aes_encrypt_blocks_arm32_crypto_256_start_2:
 	veor.32	q13, q13, q10
 	sub	r3, r3, #0x50
 	sub	r2, r2, #2
-	vst1.8	{q12, q13}, [r1]!
+	vst1.8	{q12-q13}, [r1]!
 L_aes_encrypt_blocks_arm32_crypto_256_start_1:
 	cmp	r2, #0
 	beq	L_aes_encrypt_blocks_arm32_crypto_256_done
@@ -1627,7 +1627,7 @@ L_aes_encrypt_blocks_arm32_crypto_128_start_4:
 L_aes_encrypt_blocks_arm32_crypto_128_start_2:
 	cmp	r2, #2
 	blt	L_aes_encrypt_blocks_arm32_crypto_128_start_1
-	vld1.8	{q12, q13}, [r0]!
+	vld1.8	{q12-q13}, [r0]!
 	aese.8	q12, q0
 	aesmc.8	q12, q12
 	aese.8	q13, q0
@@ -1669,7 +1669,7 @@ L_aes_encrypt_blocks_arm32_crypto_128_start_2:
 	aese.8	q13, q9
 	veor.32	q13, q13, q10
 	sub	r2, r2, #2
-	vst1.8	{q12, q13}, [r1]!
+	vst1.8	{q12-q13}, [r1]!
 L_aes_encrypt_blocks_arm32_crypto_128_start_1:
 	cmp	r2, #0
 	beq	L_aes_encrypt_blocks_arm32_crypto_128_done
@@ -1716,7 +1716,7 @@ AES_decrypt_blocks_AARCH32:
 	bgt	L_aes_decrypt_blocks_arm32_crypto_start_256
 	# AES_ECB_192
 #ifndef NO_AES_192
-	vld1.32	{q8, q9}, [r3]!
+	vld1.32	{q8-q9}, [r3]!
 	cmp	r2, #1
 	beq	L_aes_decrypt_blocks_arm32_crypto_192_start_1
 	cmp	r2, #4
@@ -1830,7 +1830,7 @@ L_aes_decrypt_blocks_arm32_crypto_192_start_4:
 L_aes_decrypt_blocks_arm32_crypto_192_start_2:
 	cmp	r2, #2
 	blt	L_aes_decrypt_blocks_arm32_crypto_192_start_1
-	vld1.8	{q12, q13}, [r0]!
+	vld1.8	{q12-q13}, [r0]!
 	aesd.8	q12, q0
 	aesimc.8	q12, q12
 	aesd.8	q13, q0
@@ -1884,7 +1884,7 @@ L_aes_decrypt_blocks_arm32_crypto_192_start_2:
 	veor.32	q13, q13, q10
 	sub	r3, r3, #48
 	sub	r2, r2, #2
-	vst1.8	{q12, q13}, [r1]!
+	vst1.8	{q12-q13}, [r1]!
 L_aes_decrypt_blocks_arm32_crypto_192_start_1:
 	cmp	r2, #0
 	beq	L_aes_decrypt_blocks_arm32_crypto_192_done
@@ -1924,7 +1924,7 @@ L_aes_decrypt_blocks_arm32_crypto_192_done:
 	# AES_ECB_256
 L_aes_decrypt_blocks_arm32_crypto_start_256:
 #ifndef NO_AES_256
-	vld1.32	{q8, q9}, [r3]!
+	vld1.32	{q8-q9}, [r3]!
 	cmp	r2, #1
 	beq	L_aes_decrypt_blocks_arm32_crypto_256_start_1
 	cmp	r2, #4
@@ -2056,7 +2056,7 @@ L_aes_decrypt_blocks_arm32_crypto_256_start_4:
 L_aes_decrypt_blocks_arm32_crypto_256_start_2:
 	cmp	r2, #2
 	blt	L_aes_decrypt_blocks_arm32_crypto_256_start_1
-	vld1.8	{q12, q13}, [r0]!
+	vld1.8	{q12-q13}, [r0]!
 	aesd.8	q12, q0
 	aesimc.8	q12, q12
 	aesd.8	q13, q0
@@ -2120,7 +2120,7 @@ L_aes_decrypt_blocks_arm32_crypto_256_start_2:
 	veor.32	q13, q13, q10
 	sub	r3, r3, #0x50
 	sub	r2, r2, #2
-	vst1.8	{q12, q13}, [r1]!
+	vst1.8	{q12-q13}, [r1]!
 L_aes_decrypt_blocks_arm32_crypto_256_start_1:
 	cmp	r2, #0
 	beq	L_aes_decrypt_blocks_arm32_crypto_256_done
@@ -2260,7 +2260,7 @@ L_aes_decrypt_blocks_arm32_crypto_128_start_4:
 L_aes_decrypt_blocks_arm32_crypto_128_start_2:
 	cmp	r2, #2
 	blt	L_aes_decrypt_blocks_arm32_crypto_128_start_1
-	vld1.8	{q12, q13}, [r0]!
+	vld1.8	{q12-q13}, [r0]!
 	aesd.8	q12, q0
 	aesimc.8	q12, q12
 	aesd.8	q13, q0
@@ -2302,7 +2302,7 @@ L_aes_decrypt_blocks_arm32_crypto_128_start_2:
 	aesd.8	q13, q9
 	veor.32	q13, q13, q10
 	sub	r2, r2, #2
-	vst1.8	{q12, q13}, [r1]!
+	vst1.8	{q12-q13}, [r1]!
 L_aes_decrypt_blocks_arm32_crypto_128_start_1:
 	cmp	r2, #0
 	beq	L_aes_decrypt_blocks_arm32_crypto_128_done
@@ -2568,7 +2568,7 @@ L_aes_cbc_encrypt_arm32_crypto_start_256:
 	vld1.8	{q14}, [r0]!
 	vldm.32	r12!, {q8-q11}
 	add	r12, r12, #16
-	vld1.32	{q12, q13}, [r12]
+	vld1.32	{q12-q13}, [r12]
 	sub	r12, r12, #16
 	cmp	r2, #1
 	beq	L_aes_cbc_encrypt_arm32_crypto_256_start_1
@@ -3020,7 +3020,7 @@ AES_CBC_decrypt_AARCH32:
 	cmp	r2, #1
 	beq	L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_1
 L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_2:
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	vmov	q11, q13
 	vmov	q12, q14
 	vmov	q13, q15
@@ -3080,7 +3080,7 @@ L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_2:
 	cmp	r2, #1
 	veor.32	q14, q14, q11
 	veor.32	q15, q15, q12
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	sub	r12, r12, #48
 	blt	L_aes_cbc_decrypt_blocks_arm32_crypto_192_done
 	bgt	L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_2
@@ -3128,7 +3128,7 @@ L_aes_cbc_decrypt_blocks_arm32_crypto_start_256:
 	cmp	r2, #1
 	beq	L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_1
 L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_2:
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	vmov	q11, q13
 	vmov	q12, q14
 	vmov	q13, q15
@@ -3198,7 +3198,7 @@ L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_2:
 	cmp	r2, #1
 	veor.32	q14, q14, q11
 	veor.32	q15, q15, q12
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	sub	r12, r12, #0x50
 	blt	L_aes_cbc_decrypt_blocks_arm32_crypto_256_done
 	bgt	L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_2
@@ -3252,7 +3252,7 @@ L_aes_cbc_decrypt_blocks_arm32_crypto_start_128:
 	cmp	r2, #1
 	beq	L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_1
 L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_2:
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	vmov	q11, q13
 	vmov	q12, q14
 	vmov	q13, q15
@@ -3300,7 +3300,7 @@ L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_2:
 	cmp	r2, #1
 	veor.32	q14, q14, q11
 	veor.32	q15, q15, q12
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	blt	L_aes_cbc_decrypt_blocks_arm32_crypto_128_done
 	bgt	L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_2
 L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_1:
@@ -3431,7 +3431,7 @@ L_aes_ctr_encrypt_arm32_crypto_192_start_2:
 	veor.32	q0, q0, q15
 	veor.32	q1, q1, q15
 	adds	r8, r8, #1
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	adcs	r7, r7, #0
 	sub	r12, r12, #16
 	veor.32	q14, q14, q0
@@ -3442,7 +3442,7 @@ L_aes_ctr_encrypt_arm32_crypto_192_start_2:
 	vmov	d2, r5, r6
 	vmov	d3, r7, r8
 	cmp	r4, #1
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	vrev32.8	q1, q1
 	bgt	L_aes_ctr_encrypt_arm32_crypto_192_start_2
 	mov	lr, #0
@@ -3625,7 +3625,7 @@ L_aes_ctr_encrypt_arm32_crypto_256_start_2:
 	veor.32	q0, q0, q15
 	veor.32	q1, q1, q15
 	adcs	r6, r6, #0
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	sub	r12, r12, #48
 	veor.32	q14, q14, q0
 	veor.32	q15, q15, q1
@@ -3634,7 +3634,7 @@ L_aes_ctr_encrypt_arm32_crypto_256_start_2:
 	vmov	d2, r5, r6
 	vmov	d3, r7, r8
 	cmp	r4, #1
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	vrev32.8	q1, q1
 	bgt	L_aes_ctr_encrypt_arm32_crypto_256_start_2
 	mov	lr, #0
@@ -3804,7 +3804,7 @@ L_aes_ctr_encrypt_arm32_crypto_128_start_2:
 	aese.8	q1, q11
 	aesmc.8	q1, q1
 	adcs	r6, r6, #0
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	adc	r5, r5, #0
 	aese.8	q0, q12
 	aese.8	q1, q12
@@ -3817,7 +3817,7 @@ L_aes_ctr_encrypt_arm32_crypto_128_start_2:
 	vmov	d2, r5, r6
 	vmov	d3, r7, r8
 	cmp	r4, #1
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	vrev32.8	q1, q1
 	bgt	L_aes_ctr_encrypt_arm32_crypto_128_start_2
 	mov	lr, #0
@@ -3912,10 +3912,10 @@ L_aes_ctr_encrypt_arm32_crypto_done:
 	.globl	AES_GCM_set_key_AARCH32
 	.type	AES_GCM_set_key_AARCH32, %function
 AES_GCM_set_key_AARCH32:
-	vpush	{d8, d9}
+	vpush	{d8-d9}
 	vld1.8	{q0}, [r0]
-	vld1.8	{q1, q2}, [r1]!
-	vld1.8	{q3, q4}, [r1]!
+	vld1.8	{q1-q2}, [r1]!
+	vld1.8	{q3-q4}, [r1]!
 	aese.8	q0, q1
 	aesmc.8	q0, q0
 	aese.8	q0, q2
@@ -3924,8 +3924,8 @@ AES_GCM_set_key_AARCH32:
 	aesmc.8	q0, q0
 	aese.8	q0, q4
 	aesmc.8	q0, q0
-	vld1.8	{q1, q2}, [r1]!
-	vld1.8	{q3, q4}, [r1]!
+	vld1.8	{q1-q2}, [r1]!
+	vld1.8	{q3-q4}, [r1]!
 	aese.8	q0, q1
 	aesmc.8	q0, q0
 	aese.8	q0, q2
@@ -3935,19 +3935,19 @@ AES_GCM_set_key_AARCH32:
 	aese.8	q0, q4
 	aesmc.8	q0, q0
 	subs	r3, r3, #10
-	vld1.8	{q1, q2}, [r1]!
+	vld1.8	{q1-q2}, [r1]!
 	aese.8	q0, q1
 	aesmc.8	q0, q0
 	aese.8	q0, q2
 	beq	L_aes_gcm_set_key_arm32_crypto_round_done
-	vld1.8	{q1, q2}, [r1]!
+	vld1.8	{q1-q2}, [r1]!
 	subs	r3, r3, #2
 	aesmc.8	q0, q0
 	aese.8	q0, q1
 	aesmc.8	q0, q0
 	aese.8	q0, q2
 	beq	L_aes_gcm_set_key_arm32_crypto_round_done
-	vld1.8	{q1, q2}, [r1]!
+	vld1.8	{q1-q2}, [r1]!
 	aesmc.8	q0, q0
 	aese.8	q0, q1
 	aesmc.8	q0, q0
@@ -3966,7 +3966,7 @@ L_aes_gcm_set_key_arm32_crypto_round_done:
 	vshl.u8	q0, q2, #4
 	vsri.u8	q0, q2, #4
 	vst1.32	{q0}, [r2]
-	vpop	{d8, d9}
+	vpop	{d8-d9}
 	bx	lr
 	.size	AES_GCM_set_key_AARCH32,.-AES_GCM_set_key_AARCH32
 	.text
@@ -4207,11 +4207,11 @@ L_aes_gcm_encrypt_arm32_crypto_192_start_2:
 	veor.8	q4, q4, q15
 	aese.8	q5, q14
 	veor.8	q5, q5, q15
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	sub	r7, r7, #16
 	veor.8	q14, q14, q4
 	veor.8	q15, q15, q5
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	cmp	r10, #1
 	bgt	L_aes_gcm_encrypt_arm32_crypto_192_start_2
 	blt	L_aes_gcm_encrypt_arm32_crypto_192_done
@@ -4447,11 +4447,11 @@ L_aes_gcm_encrypt_arm32_crypto_256_start_2:
 	veor.8	q4, q4, q15
 	aese.8	q5, q14
 	veor.8	q5, q5, q15
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	sub	r7, r7, #48
 	veor.8	q14, q14, q4
 	veor.8	q15, q15, q5
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	cmp	r10, #1
 	bgt	L_aes_gcm_encrypt_arm32_crypto_256_start_2
 	blt	L_aes_gcm_encrypt_arm32_crypto_256_done
@@ -4681,14 +4681,14 @@ L_aes_gcm_encrypt_arm32_crypto_128_start_2:
 	aesmc.8	q4, q4
 	aese.8	q5, q11
 	aesmc.8	q5, q5
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	aese.8	q4, q12
 	veor.8	q4, q4, q13
 	aese.8	q5, q12
 	veor.8	q5, q5, q13
 	veor.8	q14, q14, q4
 	veor.8	q15, q15, q5
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	cmp	r10, #1
 	bgt	L_aes_gcm_encrypt_arm32_crypto_128_start_2
 	blt	L_aes_gcm_encrypt_arm32_crypto_128_done
@@ -4973,7 +4973,7 @@ L_aes_gcm_encrypt_arm32_crypto_aad_start_4:
 	blt	L_aes_gcm_encrypt_arm32_crypto_aad_done
 	beq	L_aes_gcm_encrypt_arm32_crypto_aad_start_1
 L_aes_gcm_encrypt_arm32_crypto_aad_start_2:
-	vld1.32	{q14, q15}, [r5]!
+	vld1.32	{q14-q15}, [r5]!
 	vmov.i8	q12, #0x55
 	vshl.u8	q0, q14, #1
 	vshl.u8	q1, q15, #1
@@ -5204,7 +5204,7 @@ L_aes_gcm_encrypt_arm32_crypto_out_start_4:
 	blt	L_aes_gcm_encrypt_arm32_crypto_out_done
 	beq	L_aes_gcm_encrypt_arm32_crypto_out_start_1
 L_aes_gcm_encrypt_arm32_crypto_out_start_2:
-	vld1.32	{q14, q15}, [r1]!
+	vld1.32	{q14-q15}, [r1]!
 	vmov.i8	q12, #0x55
 	vshl.u8	q0, q14, #1
 	vshl.u8	q1, q15, #1
@@ -5567,7 +5567,7 @@ L_aes_gcm_decrypt_arm32_crypto_aad_start_4:
 	blt	L_aes_gcm_decrypt_arm32_crypto_aad_done
 	beq	L_aes_gcm_decrypt_arm32_crypto_aad_start_1
 L_aes_gcm_decrypt_arm32_crypto_aad_start_2:
-	vld1.32	{q14, q15}, [r5]!
+	vld1.32	{q14-q15}, [r5]!
 	vmov.i8	q12, #0x55
 	vshl.u8	q0, q14, #1
 	vshl.u8	q1, q15, #1
@@ -5798,7 +5798,7 @@ L_aes_gcm_decrypt_arm32_crypto_in_start_4:
 	blt	L_aes_gcm_decrypt_arm32_crypto_in_done
 	beq	L_aes_gcm_decrypt_arm32_crypto_in_start_1
 L_aes_gcm_decrypt_arm32_crypto_in_start_2:
-	vld1.32	{q14, q15}, [r0]!
+	vld1.32	{q14-q15}, [r0]!
 	vmov.i8	q12, #0x55
 	vshl.u8	q0, q14, #1
 	vshl.u8	q1, q15, #1
@@ -6156,11 +6156,11 @@ L_aes_gcm_decrypt_arm32_crypto_192_start_2:
 	veor.8	q4, q4, q15
 	aese.8	q5, q14
 	veor.8	q5, q5, q15
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	sub	r7, r7, #16
 	veor.8	q14, q14, q4
 	veor.8	q15, q15, q5
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	cmp	r10, #1
 	bgt	L_aes_gcm_decrypt_arm32_crypto_192_start_2
 	blt	L_aes_gcm_decrypt_arm32_crypto_192_done
@@ -6396,11 +6396,11 @@ L_aes_gcm_decrypt_arm32_crypto_256_start_2:
 	veor.8	q4, q4, q15
 	aese.8	q5, q14
 	veor.8	q5, q5, q15
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	sub	r7, r7, #48
 	veor.8	q14, q14, q4
 	veor.8	q15, q15, q5
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	cmp	r10, #1
 	bgt	L_aes_gcm_decrypt_arm32_crypto_256_start_2
 	blt	L_aes_gcm_decrypt_arm32_crypto_256_done
@@ -6630,14 +6630,14 @@ L_aes_gcm_decrypt_arm32_crypto_128_start_2:
 	aesmc.8	q4, q4
 	aese.8	q5, q11
 	aesmc.8	q5, q5
-	vld1.8	{q14, q15}, [r0]!
+	vld1.8	{q14-q15}, [r0]!
 	aese.8	q4, q12
 	veor.8	q4, q4, q13
 	aese.8	q5, q12
 	veor.8	q5, q5, q13
 	veor.8	q14, q14, q4
 	veor.8	q15, q15, q5
-	vst1.8	{q14, q15}, [r1]!
+	vst1.8	{q14-q15}, [r1]!
 	cmp	r10, #1
 	bgt	L_aes_gcm_decrypt_arm32_crypto_128_start_2
 	blt	L_aes_gcm_decrypt_arm32_crypto_128_done
@@ -7820,551 +7820,203 @@ L_aes_xts_decrypt_arm32_crypto_done:
 #endif /* WOLFSSL_AES_XTS */
 #else
 #ifdef HAVE_AES_DECRYPT
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_td_data, %object
 	.size	L_AES_ARM32_td_data, 1024
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_td_data:
-	.word	0x5051f4a7
-	.word	0x537e4165
-	.word	0xc31a17a4
-	.word	0x963a275e
-	.word	0xcb3bab6b
-	.word	0xf11f9d45
-	.word	0xabacfa58
-	.word	0x934be303
-	.word	0x552030fa
-	.word	0xf6ad766d
-	.word	0x9188cc76
-	.word	0x25f5024c
-	.word	0xfc4fe5d7
-	.word	0xd7c52acb
-	.word	0x80263544
-	.word	0x8fb562a3
-	.word	0x49deb15a
-	.word	0x6725ba1b
-	.word	0x9845ea0e
-	.word	0xe15dfec0
-	.word	0x2c32f75
-	.word	0x12814cf0
-	.word	0xa38d4697
-	.word	0xc66bd3f9
-	.word	0xe7038f5f
-	.word	0x9515929c
-	.word	0xebbf6d7a
-	.word	0xda955259
-	.word	0x2dd4be83
-	.word	0xd3587421
-	.word	0x2949e069
-	.word	0x448ec9c8
-	.word	0x6a75c289
-	.word	0x78f48e79
-	.word	0x6b99583e
-	.word	0xdd27b971
-	.word	0xb6bee14f
-	.word	0x17f088ad
-	.word	0x66c920ac
-	.word	0xb47dce3a
-	.word	0x1863df4a
-	.word	0x82e51a31
-	.word	0x60975133
-	.word	0x4562537f
-	.word	0xe0b16477
-	.word	0x84bb6bae
-	.word	0x1cfe81a0
-	.word	0x94f9082b
-	.word	0x58704868
-	.word	0x198f45fd
-	.word	0x8794de6c
-	.word	0xb7527bf8
-	.word	0x23ab73d3
-	.word	0xe2724b02
-	.word	0x57e31f8f
-	.word	0x2a6655ab
-	.word	0x7b2eb28
-	.word	0x32fb5c2
-	.word	0x9a86c57b
-	.word	0xa5d33708
-	.word	0xf2302887
-	.word	0xb223bfa5
-	.word	0xba02036a
-	.word	0x5ced1682
-	.word	0x2b8acf1c
-	.word	0x92a779b4
-	.word	0xf0f307f2
-	.word	0xa14e69e2
-	.word	0xcd65daf4
-	.word	0xd50605be
-	.word	0x1fd13462
-	.word	0x8ac4a6fe
-	.word	0x9d342e53
-	.word	0xa0a2f355
-	.word	0x32058ae1
-	.word	0x75a4f6eb
-	.word	0x390b83ec
-	.word	0xaa4060ef
-	.word	0x65e719f
-	.word	0x51bd6e10
-	.word	0xf93e218a
-	.word	0x3d96dd06
-	.word	0xaedd3e05
-	.word	0x464de6bd
-	.word	0xb591548d
-	.word	0x571c45d
-	.word	0x6f0406d4
-	.word	0xff605015
-	.word	0x241998fb
-	.word	0x97d6bde9
-	.word	0xcc894043
-	.word	0x7767d99e
-	.word	0xbdb0e842
-	.word	0x8807898b
-	.word	0x38e7195b
-	.word	0xdb79c8ee
-	.word	0x47a17c0a
-	.word	0xe97c420f
-	.word	0xc9f8841e
-	.word	0x0
-	.word	0x83098086
-	.word	0x48322bed
-	.word	0xac1e1170
-	.word	0x4e6c5a72
-	.word	0xfbfd0eff
-	.word	0x560f8538
-	.word	0x1e3daed5
-	.word	0x27362d39
-	.word	0x640a0fd9
-	.word	0x21685ca6
-	.word	0xd19b5b54
-	.word	0x3a24362e
-	.word	0xb10c0a67
-	.word	0xf9357e7
-	.word	0xd2b4ee96
-	.word	0x9e1b9b91
-	.word	0x4f80c0c5
-	.word	0xa261dc20
-	.word	0x695a774b
-	.word	0x161c121a
-	.word	0xae293ba
-	.word	0xe5c0a02a
-	.word	0x433c22e0
-	.word	0x1d121b17
-	.word	0xb0e090d
-	.word	0xadf28bc7
-	.word	0xb92db6a8
-	.word	0xc8141ea9
-	.word	0x8557f119
-	.word	0x4caf7507
-	.word	0xbbee99dd
-	.word	0xfda37f60
-	.word	0x9ff70126
-	.word	0xbc5c72f5
-	.word	0xc544663b
-	.word	0x345bfb7e
-	.word	0x768b4329
-	.word	0xdccb23c6
-	.word	0x68b6edfc
-	.word	0x63b8e4f1
-	.word	0xcad731dc
-	.word	0x10426385
-	.word	0x40139722
-	.word	0x2084c611
-	.word	0x7d854a24
-	.word	0xf8d2bb3d
-	.word	0x11aef932
-	.word	0x6dc729a1
-	.word	0x4b1d9e2f
-	.word	0xf3dcb230
-	.word	0xec0d8652
-	.word	0xd077c1e3
-	.word	0x6c2bb316
-	.word	0x99a970b9
-	.word	0xfa119448
-	.word	0x2247e964
-	.word	0xc4a8fc8c
-	.word	0x1aa0f03f
-	.word	0xd8567d2c
-	.word	0xef223390
-	.word	0xc787494e
-	.word	0xc1d938d1
-	.word	0xfe8ccaa2
-	.word	0x3698d40b
-	.word	0xcfa6f581
-	.word	0x28a57ade
-	.word	0x26dab78e
-	.word	0xa43fadbf
-	.word	0xe42c3a9d
-	.word	0xd507892
-	.word	0x9b6a5fcc
-	.word	0x62547e46
-	.word	0xc2f68d13
-	.word	0xe890d8b8
-	.word	0x5e2e39f7
-	.word	0xf582c3af
-	.word	0xbe9f5d80
-	.word	0x7c69d093
-	.word	0xa96fd52d
-	.word	0xb3cf2512
-	.word	0x3bc8ac99
-	.word	0xa710187d
-	.word	0x6ee89c63
-	.word	0x7bdb3bbb
-	.word	0x9cd2678
-	.word	0xf46e5918
-	.word	0x1ec9ab7
-	.word	0xa8834f9a
-	.word	0x65e6956e
-	.word	0x7eaaffe6
-	.word	0x821bccf
-	.word	0xe6ef15e8
-	.word	0xd9bae79b
-	.word	0xce4a6f36
-	.word	0xd4ea9f09
-	.word	0xd629b07c
-	.word	0xaf31a4b2
-	.word	0x312a3f23
-	.word	0x30c6a594
-	.word	0xc035a266
-	.word	0x37744ebc
-	.word	0xa6fc82ca
-	.word	0xb0e090d0
-	.word	0x1533a7d8
-	.word	0x4af10498
-	.word	0xf741ecda
-	.word	0xe7fcd50
-	.word	0x2f1791f6
-	.word	0x8d764dd6
-	.word	0x4d43efb0
-	.word	0x54ccaa4d
-	.word	0xdfe49604
-	.word	0xe39ed1b5
-	.word	0x1b4c6a88
-	.word	0xb8c12c1f
-	.word	0x7f466551
-	.word	0x49d5eea
-	.word	0x5d018c35
-	.word	0x73fa8774
-	.word	0x2efb0b41
-	.word	0x5ab3671d
-	.word	0x5292dbd2
-	.word	0x33e91056
-	.word	0x136dd647
-	.word	0x8c9ad761
-	.word	0x7a37a10c
-	.word	0x8e59f814
-	.word	0x89eb133c
-	.word	0xeecea927
-	.word	0x35b761c9
-	.word	0xede11ce5
-	.word	0x3c7a47b1
-	.word	0x599cd2df
-	.word	0x3f55f273
-	.word	0x791814ce
-	.word	0xbf73c737
-	.word	0xea53f7cd
-	.word	0x5b5ffdaa
-	.word	0x14df3d6f
-	.word	0x867844db
-	.word	0x81caaff3
-	.word	0x3eb968c4
-	.word	0x2c382434
-	.word	0x5fc2a340
-	.word	0x72161dc3
-	.word	0xcbce225
-	.word	0x8b283c49
-	.word	0x41ff0d95
-	.word	0x7139a801
-	.word	0xde080cb3
-	.word	0x9cd8b4e4
-	.word	0x906456c1
-	.word	0x617bcb84
-	.word	0x70d532b6
-	.word	0x74486c5c
-	.word	0x42d0b857
+	.long	0x5051f4a7,0x537e4165,0xc31a17a4,0x963a275e
+	.long	0xcb3bab6b,0xf11f9d45,0xabacfa58,0x934be303
+	.long	0x552030fa,0xf6ad766d,0x9188cc76,0x25f5024c
+	.long	0xfc4fe5d7,0xd7c52acb,0x80263544,0x8fb562a3
+	.long	0x49deb15a,0x6725ba1b,0x9845ea0e,0xe15dfec0
+	.long	0x02c32f75,0x12814cf0,0xa38d4697,0xc66bd3f9
+	.long	0xe7038f5f,0x9515929c,0xebbf6d7a,0xda955259
+	.long	0x2dd4be83,0xd3587421,0x2949e069,0x448ec9c8
+	.long	0x6a75c289,0x78f48e79,0x6b99583e,0xdd27b971
+	.long	0xb6bee14f,0x17f088ad,0x66c920ac,0xb47dce3a
+	.long	0x1863df4a,0x82e51a31,0x60975133,0x4562537f
+	.long	0xe0b16477,0x84bb6bae,0x1cfe81a0,0x94f9082b
+	.long	0x58704868,0x198f45fd,0x8794de6c,0xb7527bf8
+	.long	0x23ab73d3,0xe2724b02,0x57e31f8f,0x2a6655ab
+	.long	0x07b2eb28,0x032fb5c2,0x9a86c57b,0xa5d33708
+	.long	0xf2302887,0xb223bfa5,0xba02036a,0x5ced1682
+	.long	0x2b8acf1c,0x92a779b4,0xf0f307f2,0xa14e69e2
+	.long	0xcd65daf4,0xd50605be,0x1fd13462,0x8ac4a6fe
+	.long	0x9d342e53,0xa0a2f355,0x32058ae1,0x75a4f6eb
+	.long	0x390b83ec,0xaa4060ef,0x065e719f,0x51bd6e10
+	.long	0xf93e218a,0x3d96dd06,0xaedd3e05,0x464de6bd
+	.long	0xb591548d,0x0571c45d,0x6f0406d4,0xff605015
+	.long	0x241998fb,0x97d6bde9,0xcc894043,0x7767d99e
+	.long	0xbdb0e842,0x8807898b,0x38e7195b,0xdb79c8ee
+	.long	0x47a17c0a,0xe97c420f,0xc9f8841e,0x00000000
+	.long	0x83098086,0x48322bed,0xac1e1170,0x4e6c5a72
+	.long	0xfbfd0eff,0x560f8538,0x1e3daed5,0x27362d39
+	.long	0x640a0fd9,0x21685ca6,0xd19b5b54,0x3a24362e
+	.long	0xb10c0a67,0x0f9357e7,0xd2b4ee96,0x9e1b9b91
+	.long	0x4f80c0c5,0xa261dc20,0x695a774b,0x161c121a
+	.long	0x0ae293ba,0xe5c0a02a,0x433c22e0,0x1d121b17
+	.long	0x0b0e090d,0xadf28bc7,0xb92db6a8,0xc8141ea9
+	.long	0x8557f119,0x4caf7507,0xbbee99dd,0xfda37f60
+	.long	0x9ff70126,0xbc5c72f5,0xc544663b,0x345bfb7e
+	.long	0x768b4329,0xdccb23c6,0x68b6edfc,0x63b8e4f1
+	.long	0xcad731dc,0x10426385,0x40139722,0x2084c611
+	.long	0x7d854a24,0xf8d2bb3d,0x11aef932,0x6dc729a1
+	.long	0x4b1d9e2f,0xf3dcb230,0xec0d8652,0xd077c1e3
+	.long	0x6c2bb316,0x99a970b9,0xfa119448,0x2247e964
+	.long	0xc4a8fc8c,0x1aa0f03f,0xd8567d2c,0xef223390
+	.long	0xc787494e,0xc1d938d1,0xfe8ccaa2,0x3698d40b
+	.long	0xcfa6f581,0x28a57ade,0x26dab78e,0xa43fadbf
+	.long	0xe42c3a9d,0x0d507892,0x9b6a5fcc,0x62547e46
+	.long	0xc2f68d13,0xe890d8b8,0x5e2e39f7,0xf582c3af
+	.long	0xbe9f5d80,0x7c69d093,0xa96fd52d,0xb3cf2512
+	.long	0x3bc8ac99,0xa710187d,0x6ee89c63,0x7bdb3bbb
+	.long	0x09cd2678,0xf46e5918,0x01ec9ab7,0xa8834f9a
+	.long	0x65e6956e,0x7eaaffe6,0x0821bccf,0xe6ef15e8
+	.long	0xd9bae79b,0xce4a6f36,0xd4ea9f09,0xd629b07c
+	.long	0xaf31a4b2,0x312a3f23,0x30c6a594,0xc035a266
+	.long	0x37744ebc,0xa6fc82ca,0xb0e090d0,0x1533a7d8
+	.long	0x4af10498,0xf741ecda,0x0e7fcd50,0x2f1791f6
+	.long	0x8d764dd6,0x4d43efb0,0x54ccaa4d,0xdfe49604
+	.long	0xe39ed1b5,0x1b4c6a88,0xb8c12c1f,0x7f466551
+	.long	0x049d5eea,0x5d018c35,0x73fa8774,0x2efb0b41
+	.long	0x5ab3671d,0x5292dbd2,0x33e91056,0x136dd647
+	.long	0x8c9ad761,0x7a37a10c,0x8e59f814,0x89eb133c
+	.long	0xeecea927,0x35b761c9,0xede11ce5,0x3c7a47b1
+	.long	0x599cd2df,0x3f55f273,0x791814ce,0xbf73c737
+	.long	0xea53f7cd,0x5b5ffdaa,0x14df3d6f,0x867844db
+	.long	0x81caaff3,0x3eb968c4,0x2c382434,0x5fc2a340
+	.long	0x72161dc3,0x0cbce225,0x8b283c49,0x41ff0d95
+	.long	0x7139a801,0xde080cb3,0x9cd8b4e4,0x906456c1
+	.long	0x617bcb84,0x70d532b6,0x74486c5c,0x42d0b857
 #endif /* HAVE_AES_DECRYPT */
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_te_data, %object
 	.size	L_AES_ARM32_te_data, 1024
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_te_data:
-	.word	0xa5c66363
-	.word	0x84f87c7c
-	.word	0x99ee7777
-	.word	0x8df67b7b
-	.word	0xdfff2f2
-	.word	0xbdd66b6b
-	.word	0xb1de6f6f
-	.word	0x5491c5c5
-	.word	0x50603030
-	.word	0x3020101
-	.word	0xa9ce6767
-	.word	0x7d562b2b
-	.word	0x19e7fefe
-	.word	0x62b5d7d7
-	.word	0xe64dabab
-	.word	0x9aec7676
-	.word	0x458fcaca
-	.word	0x9d1f8282
-	.word	0x4089c9c9
-	.word	0x87fa7d7d
-	.word	0x15effafa
-	.word	0xebb25959
-	.word	0xc98e4747
-	.word	0xbfbf0f0
-	.word	0xec41adad
-	.word	0x67b3d4d4
-	.word	0xfd5fa2a2
-	.word	0xea45afaf
-	.word	0xbf239c9c
-	.word	0xf753a4a4
-	.word	0x96e47272
-	.word	0x5b9bc0c0
-	.word	0xc275b7b7
-	.word	0x1ce1fdfd
-	.word	0xae3d9393
-	.word	0x6a4c2626
-	.word	0x5a6c3636
-	.word	0x417e3f3f
-	.word	0x2f5f7f7
-	.word	0x4f83cccc
-	.word	0x5c683434
-	.word	0xf451a5a5
-	.word	0x34d1e5e5
-	.word	0x8f9f1f1
-	.word	0x93e27171
-	.word	0x73abd8d8
-	.word	0x53623131
-	.word	0x3f2a1515
-	.word	0xc080404
-	.word	0x5295c7c7
-	.word	0x65462323
-	.word	0x5e9dc3c3
-	.word	0x28301818
-	.word	0xa1379696
-	.word	0xf0a0505
-	.word	0xb52f9a9a
-	.word	0x90e0707
-	.word	0x36241212
-	.word	0x9b1b8080
-	.word	0x3ddfe2e2
-	.word	0x26cdebeb
-	.word	0x694e2727
-	.word	0xcd7fb2b2
-	.word	0x9fea7575
-	.word	0x1b120909
-	.word	0x9e1d8383
-	.word	0x74582c2c
-	.word	0x2e341a1a
-	.word	0x2d361b1b
-	.word	0xb2dc6e6e
-	.word	0xeeb45a5a
-	.word	0xfb5ba0a0
-	.word	0xf6a45252
-	.word	0x4d763b3b
-	.word	0x61b7d6d6
-	.word	0xce7db3b3
-	.word	0x7b522929
-	.word	0x3edde3e3
-	.word	0x715e2f2f
-	.word	0x97138484
-	.word	0xf5a65353
-	.word	0x68b9d1d1
-	.word	0x0
-	.word	0x2cc1eded
-	.word	0x60402020
-	.word	0x1fe3fcfc
-	.word	0xc879b1b1
-	.word	0xedb65b5b
-	.word	0xbed46a6a
-	.word	0x468dcbcb
-	.word	0xd967bebe
-	.word	0x4b723939
-	.word	0xde944a4a
-	.word	0xd4984c4c
-	.word	0xe8b05858
-	.word	0x4a85cfcf
-	.word	0x6bbbd0d0
-	.word	0x2ac5efef
-	.word	0xe54faaaa
-	.word	0x16edfbfb
-	.word	0xc5864343
-	.word	0xd79a4d4d
-	.word	0x55663333
-	.word	0x94118585
-	.word	0xcf8a4545
-	.word	0x10e9f9f9
-	.word	0x6040202
-	.word	0x81fe7f7f
-	.word	0xf0a05050
-	.word	0x44783c3c
-	.word	0xba259f9f
-	.word	0xe34ba8a8
-	.word	0xf3a25151
-	.word	0xfe5da3a3
-	.word	0xc0804040
-	.word	0x8a058f8f
-	.word	0xad3f9292
-	.word	0xbc219d9d
-	.word	0x48703838
-	.word	0x4f1f5f5
-	.word	0xdf63bcbc
-	.word	0xc177b6b6
-	.word	0x75afdada
-	.word	0x63422121
-	.word	0x30201010
-	.word	0x1ae5ffff
-	.word	0xefdf3f3
-	.word	0x6dbfd2d2
-	.word	0x4c81cdcd
-	.word	0x14180c0c
-	.word	0x35261313
-	.word	0x2fc3ecec
-	.word	0xe1be5f5f
-	.word	0xa2359797
-	.word	0xcc884444
-	.word	0x392e1717
-	.word	0x5793c4c4
-	.word	0xf255a7a7
-	.word	0x82fc7e7e
-	.word	0x477a3d3d
-	.word	0xacc86464
-	.word	0xe7ba5d5d
-	.word	0x2b321919
-	.word	0x95e67373
-	.word	0xa0c06060
-	.word	0x98198181
-	.word	0xd19e4f4f
-	.word	0x7fa3dcdc
-	.word	0x66442222
-	.word	0x7e542a2a
-	.word	0xab3b9090
-	.word	0x830b8888
-	.word	0xca8c4646
-	.word	0x29c7eeee
-	.word	0xd36bb8b8
-	.word	0x3c281414
-	.word	0x79a7dede
-	.word	0xe2bc5e5e
-	.word	0x1d160b0b
-	.word	0x76addbdb
-	.word	0x3bdbe0e0
-	.word	0x56643232
-	.word	0x4e743a3a
-	.word	0x1e140a0a
-	.word	0xdb924949
-	.word	0xa0c0606
-	.word	0x6c482424
-	.word	0xe4b85c5c
-	.word	0x5d9fc2c2
-	.word	0x6ebdd3d3
-	.word	0xef43acac
-	.word	0xa6c46262
-	.word	0xa8399191
-	.word	0xa4319595
-	.word	0x37d3e4e4
-	.word	0x8bf27979
-	.word	0x32d5e7e7
-	.word	0x438bc8c8
-	.word	0x596e3737
-	.word	0xb7da6d6d
-	.word	0x8c018d8d
-	.word	0x64b1d5d5
-	.word	0xd29c4e4e
-	.word	0xe049a9a9
-	.word	0xb4d86c6c
-	.word	0xfaac5656
-	.word	0x7f3f4f4
-	.word	0x25cfeaea
-	.word	0xafca6565
-	.word	0x8ef47a7a
-	.word	0xe947aeae
-	.word	0x18100808
-	.word	0xd56fbaba
-	.word	0x88f07878
-	.word	0x6f4a2525
-	.word	0x725c2e2e
-	.word	0x24381c1c
-	.word	0xf157a6a6
-	.word	0xc773b4b4
-	.word	0x5197c6c6
-	.word	0x23cbe8e8
-	.word	0x7ca1dddd
-	.word	0x9ce87474
-	.word	0x213e1f1f
-	.word	0xdd964b4b
-	.word	0xdc61bdbd
-	.word	0x860d8b8b
-	.word	0x850f8a8a
-	.word	0x90e07070
-	.word	0x427c3e3e
-	.word	0xc471b5b5
-	.word	0xaacc6666
-	.word	0xd8904848
-	.word	0x5060303
-	.word	0x1f7f6f6
-	.word	0x121c0e0e
-	.word	0xa3c26161
-	.word	0x5f6a3535
-	.word	0xf9ae5757
-	.word	0xd069b9b9
-	.word	0x91178686
-	.word	0x5899c1c1
-	.word	0x273a1d1d
-	.word	0xb9279e9e
-	.word	0x38d9e1e1
-	.word	0x13ebf8f8
-	.word	0xb32b9898
-	.word	0x33221111
-	.word	0xbbd26969
-	.word	0x70a9d9d9
-	.word	0x89078e8e
-	.word	0xa7339494
-	.word	0xb62d9b9b
-	.word	0x223c1e1e
-	.word	0x92158787
-	.word	0x20c9e9e9
-	.word	0x4987cece
-	.word	0xffaa5555
-	.word	0x78502828
-	.word	0x7aa5dfdf
-	.word	0x8f038c8c
-	.word	0xf859a1a1
-	.word	0x80098989
-	.word	0x171a0d0d
-	.word	0xda65bfbf
-	.word	0x31d7e6e6
-	.word	0xc6844242
-	.word	0xb8d06868
-	.word	0xc3824141
-	.word	0xb0299999
-	.word	0x775a2d2d
-	.word	0x111e0f0f
-	.word	0xcb7bb0b0
-	.word	0xfca85454
-	.word	0xd66dbbbb
-	.word	0x3a2c1616
+	.long	0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b
+	.long	0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5
+	.long	0x50603030,0x03020101,0xa9ce6767,0x7d562b2b
+	.long	0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676
+	.long	0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d
+	.long	0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0
+	.long	0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf
+	.long	0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0
+	.long	0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626
+	.long	0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc
+	.long	0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1
+	.long	0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515
+	.long	0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3
+	.long	0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a
+	.long	0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2
+	.long	0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575
+	.long	0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a
+	.long	0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0
+	.long	0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3
+	.long	0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484
+	.long	0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded
+	.long	0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b
+	.long	0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939
+	.long	0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf
+	.long	0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb
+	.long	0xc5864343,0xd79a4d4d,0x55663333,0x94118585
+	.long	0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f
+	.long	0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8
+	.long	0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f
+	.long	0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5
+	.long	0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121
+	.long	0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2
+	.long	0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec
+	.long	0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717
+	.long	0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d
+	.long	0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373
+	.long	0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc
+	.long	0x66442222,0x7e542a2a,0xab3b9090,0x830b8888
+	.long	0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414
+	.long	0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb
+	.long	0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a
+	.long	0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c
+	.long	0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262
+	.long	0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979
+	.long	0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d
+	.long	0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9
+	.long	0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea
+	.long	0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808
+	.long	0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e
+	.long	0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6
+	.long	0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f
+	.long	0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a
+	.long	0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666
+	.long	0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e
+	.long	0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9
+	.long	0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e
+	.long	0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111
+	.long	0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494
+	.long	0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9
+	.long	0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf
+	.long	0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d
+	.long	0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868
+	.long	0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f
+	.long	0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616
 #endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
         * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
 #ifdef HAVE_AES_DECRYPT
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_td, %object
 	.size	L_AES_ARM32_td, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_td:
-	.word	L_AES_ARM32_td_data
+	.long	L_AES_ARM32_td_data
 #endif /* HAVE_AES_DECRYPT */
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_te, %object
 	.size	L_AES_ARM32_te, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_te:
-	.word	L_AES_ARM32_te_data
+	.long	L_AES_ARM32_te_data
 #endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
         * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
 #ifdef HAVE_AES_DECRYPT
@@ -8570,21 +8222,23 @@ L_AES_invert_key_mix_loop:
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	AES_invert_key,.-AES_invert_key
 #endif /* HAVE_AES_DECRYPT */
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_rcon, %object
 	.size	L_AES_ARM32_rcon, 40
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_rcon:
-	.word	0x1000000
-	.word	0x2000000
-	.word	0x4000000
-	.word	0x8000000
-	.word	0x10000000
-	.word	0x20000000
-	.word	0x40000000
-	.word	0x80000000
-	.word	0x1b000000
-	.word	0x36000000
+	.long	0x01000000,0x02000000,0x04000000,0x08000000
+	.long	0x10000000,0x20000000,0x40000000,0x80000000
+	.long	0x1b000000,0x36000000
 	.text
 	.align	4
 	.globl	AES_set_encrypt_key
@@ -9698,12 +9352,21 @@ L_AES_encrypt_block_nr:
 #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
     defined(HAVE_AES_ECB)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_te_ecb, %object
 	.size	L_AES_ARM32_te_ecb, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_te_ecb:
-	.word	L_AES_ARM32_te_data
+	.long	L_AES_ARM32_te_data
 	.text
 	.align	4
 	.globl	AES_ECB_encrypt
@@ -11854,12 +11517,21 @@ L_AES_ECB_encrypt_end:
 #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT ||
         * WOLFSSL_AES_COUNTER || HAVE_AES_ECB */
 #ifdef HAVE_AES_CBC
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_te_cbc, %object
 	.size	L_AES_ARM32_te_cbc, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_te_cbc:
-	.word	L_AES_ARM32_te_data
+	.long	L_AES_ARM32_te_data
 	.text
 	.align	4
 	.globl	AES_CBC_encrypt
@@ -14024,12 +13696,21 @@ L_AES_CBC_encrypt_end:
 	.size	AES_CBC_encrypt,.-AES_CBC_encrypt
 #endif /* HAVE_AES_CBC */
 #ifdef WOLFSSL_AES_COUNTER
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_te_ctr, %object
 	.size	L_AES_ARM32_te_ctr, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_te_ctr:
-	.word	L_AES_ARM32_te_data
+	.long	L_AES_ARM32_te_data
 	.text
 	.align	4
 	.globl	AES_CTR_encrypt
@@ -16843,274 +16524,68 @@ L_AES_decrypt_block_nr:
 	pop	{pc}
 	.size	AES_decrypt_block,.-AES_decrypt_block
 #endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_td_ecb, %object
 	.size	L_AES_ARM32_td_ecb, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_td_ecb:
-	.word	L_AES_ARM32_td_data
+	.long	L_AES_ARM32_td_data
 #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_ecb_td4, %object
 	.size	L_AES_ARM32_ecb_td4, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 4-byte aligned, 32-bit aligned
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_AES_ARM32_ecb_td4:
-	.byte	0x52
-	.byte	0x9
-	.byte	0x6a
-	.byte	0xd5
-	.byte	0x30
-	.byte	0x36
-	.byte	0xa5
-	.byte	0x38
-	.byte	0xbf
-	.byte	0x40
-	.byte	0xa3
-	.byte	0x9e
-	.byte	0x81
-	.byte	0xf3
-	.byte	0xd7
-	.byte	0xfb
-	.byte	0x7c
-	.byte	0xe3
-	.byte	0x39
-	.byte	0x82
-	.byte	0x9b
-	.byte	0x2f
-	.byte	0xff
-	.byte	0x87
-	.byte	0x34
-	.byte	0x8e
-	.byte	0x43
-	.byte	0x44
-	.byte	0xc4
-	.byte	0xde
-	.byte	0xe9
-	.byte	0xcb
-	.byte	0x54
-	.byte	0x7b
-	.byte	0x94
-	.byte	0x32
-	.byte	0xa6
-	.byte	0xc2
-	.byte	0x23
-	.byte	0x3d
-	.byte	0xee
-	.byte	0x4c
-	.byte	0x95
-	.byte	0xb
-	.byte	0x42
-	.byte	0xfa
-	.byte	0xc3
-	.byte	0x4e
-	.byte	0x8
-	.byte	0x2e
-	.byte	0xa1
-	.byte	0x66
-	.byte	0x28
-	.byte	0xd9
-	.byte	0x24
-	.byte	0xb2
-	.byte	0x76
-	.byte	0x5b
-	.byte	0xa2
-	.byte	0x49
-	.byte	0x6d
-	.byte	0x8b
-	.byte	0xd1
-	.byte	0x25
-	.byte	0x72
-	.byte	0xf8
-	.byte	0xf6
-	.byte	0x64
-	.byte	0x86
-	.byte	0x68
-	.byte	0x98
-	.byte	0x16
-	.byte	0xd4
-	.byte	0xa4
-	.byte	0x5c
-	.byte	0xcc
-	.byte	0x5d
-	.byte	0x65
-	.byte	0xb6
-	.byte	0x92
-	.byte	0x6c
-	.byte	0x70
-	.byte	0x48
-	.byte	0x50
-	.byte	0xfd
-	.byte	0xed
-	.byte	0xb9
-	.byte	0xda
-	.byte	0x5e
-	.byte	0x15
-	.byte	0x46
-	.byte	0x57
-	.byte	0xa7
-	.byte	0x8d
-	.byte	0x9d
-	.byte	0x84
-	.byte	0x90
-	.byte	0xd8
-	.byte	0xab
-	.byte	0x0
-	.byte	0x8c
-	.byte	0xbc
-	.byte	0xd3
-	.byte	0xa
-	.byte	0xf7
-	.byte	0xe4
-	.byte	0x58
-	.byte	0x5
-	.byte	0xb8
-	.byte	0xb3
-	.byte	0x45
-	.byte	0x6
-	.byte	0xd0
-	.byte	0x2c
-	.byte	0x1e
-	.byte	0x8f
-	.byte	0xca
-	.byte	0x3f
-	.byte	0xf
-	.byte	0x2
-	.byte	0xc1
-	.byte	0xaf
-	.byte	0xbd
-	.byte	0x3
-	.byte	0x1
-	.byte	0x13
-	.byte	0x8a
-	.byte	0x6b
-	.byte	0x3a
-	.byte	0x91
-	.byte	0x11
-	.byte	0x41
-	.byte	0x4f
-	.byte	0x67
-	.byte	0xdc
-	.byte	0xea
-	.byte	0x97
-	.byte	0xf2
-	.byte	0xcf
-	.byte	0xce
-	.byte	0xf0
-	.byte	0xb4
-	.byte	0xe6
-	.byte	0x73
-	.byte	0x96
-	.byte	0xac
-	.byte	0x74
-	.byte	0x22
-	.byte	0xe7
-	.byte	0xad
-	.byte	0x35
-	.byte	0x85
-	.byte	0xe2
-	.byte	0xf9
-	.byte	0x37
-	.byte	0xe8
-	.byte	0x1c
-	.byte	0x75
-	.byte	0xdf
-	.byte	0x6e
-	.byte	0x47
-	.byte	0xf1
-	.byte	0x1a
-	.byte	0x71
-	.byte	0x1d
-	.byte	0x29
-	.byte	0xc5
-	.byte	0x89
-	.byte	0x6f
-	.byte	0xb7
-	.byte	0x62
-	.byte	0xe
-	.byte	0xaa
-	.byte	0x18
-	.byte	0xbe
-	.byte	0x1b
-	.byte	0xfc
-	.byte	0x56
-	.byte	0x3e
-	.byte	0x4b
-	.byte	0xc6
-	.byte	0xd2
-	.byte	0x79
-	.byte	0x20
-	.byte	0x9a
-	.byte	0xdb
-	.byte	0xc0
-	.byte	0xfe
-	.byte	0x78
-	.byte	0xcd
-	.byte	0x5a
-	.byte	0xf4
-	.byte	0x1f
-	.byte	0xdd
-	.byte	0xa8
-	.byte	0x33
-	.byte	0x88
-	.byte	0x7
-	.byte	0xc7
-	.byte	0x31
-	.byte	0xb1
-	.byte	0x12
-	.byte	0x10
-	.byte	0x59
-	.byte	0x27
-	.byte	0x80
-	.byte	0xec
-	.byte	0x5f
-	.byte	0x60
-	.byte	0x51
-	.byte	0x7f
-	.byte	0xa9
-	.byte	0x19
-	.byte	0xb5
-	.byte	0x4a
-	.byte	0xd
-	.byte	0x2d
-	.byte	0xe5
-	.byte	0x7a
-	.byte	0x9f
-	.byte	0x93
-	.byte	0xc9
-	.byte	0x9c
-	.byte	0xef
-	.byte	0xa0
-	.byte	0xe0
-	.byte	0x3b
-	.byte	0x4d
-	.byte	0xae
-	.byte	0x2a
-	.byte	0xf5
-	.byte	0xb0
-	.byte	0xc8
-	.byte	0xeb
-	.byte	0xbb
-	.byte	0x3c
-	.byte	0x83
-	.byte	0x53
-	.byte	0x99
-	.byte	0x61
-	.byte	0x17
-	.byte	0x2b
-	.byte	0x4
-	.byte	0x7e
-	.byte	0xba
-	.byte	0x77
-	.byte	0xd6
-	.byte	0x26
-	.byte	0xe1
-	.byte	0x69
-	.byte	0x14
-	.byte	0x63
-	.byte	0x55
-	.byte	0x21
-	.byte	0xc
-	.byte	0x7d
+	.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+	.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+	.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+	.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+	.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+	.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+	.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+	.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+	.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+	.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+	.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+	.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+	.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+	.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+	.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+	.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+	.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+	.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+	.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+	.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+	.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+	.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+	.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+	.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+	.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+	.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+	.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+	.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+	.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+	.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+	.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+	.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
 	.text
 	.align	4
 	.globl	AES_ECB_decrypt
@@ -19257,267 +18732,52 @@ L_AES_ECB_decrypt_end:
 	.size	AES_ECB_decrypt,.-AES_ECB_decrypt
 #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || defined(HAVE_AES_ECB) */
 #ifdef HAVE_AES_CBC
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_cbc_td4, %object
 	.size	L_AES_ARM32_cbc_td4, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 4-byte aligned, 32-bit aligned
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_AES_ARM32_cbc_td4:
-	.byte	0x52
-	.byte	0x9
-	.byte	0x6a
-	.byte	0xd5
-	.byte	0x30
-	.byte	0x36
-	.byte	0xa5
-	.byte	0x38
-	.byte	0xbf
-	.byte	0x40
-	.byte	0xa3
-	.byte	0x9e
-	.byte	0x81
-	.byte	0xf3
-	.byte	0xd7
-	.byte	0xfb
-	.byte	0x7c
-	.byte	0xe3
-	.byte	0x39
-	.byte	0x82
-	.byte	0x9b
-	.byte	0x2f
-	.byte	0xff
-	.byte	0x87
-	.byte	0x34
-	.byte	0x8e
-	.byte	0x43
-	.byte	0x44
-	.byte	0xc4
-	.byte	0xde
-	.byte	0xe9
-	.byte	0xcb
-	.byte	0x54
-	.byte	0x7b
-	.byte	0x94
-	.byte	0x32
-	.byte	0xa6
-	.byte	0xc2
-	.byte	0x23
-	.byte	0x3d
-	.byte	0xee
-	.byte	0x4c
-	.byte	0x95
-	.byte	0xb
-	.byte	0x42
-	.byte	0xfa
-	.byte	0xc3
-	.byte	0x4e
-	.byte	0x8
-	.byte	0x2e
-	.byte	0xa1
-	.byte	0x66
-	.byte	0x28
-	.byte	0xd9
-	.byte	0x24
-	.byte	0xb2
-	.byte	0x76
-	.byte	0x5b
-	.byte	0xa2
-	.byte	0x49
-	.byte	0x6d
-	.byte	0x8b
-	.byte	0xd1
-	.byte	0x25
-	.byte	0x72
-	.byte	0xf8
-	.byte	0xf6
-	.byte	0x64
-	.byte	0x86
-	.byte	0x68
-	.byte	0x98
-	.byte	0x16
-	.byte	0xd4
-	.byte	0xa4
-	.byte	0x5c
-	.byte	0xcc
-	.byte	0x5d
-	.byte	0x65
-	.byte	0xb6
-	.byte	0x92
-	.byte	0x6c
-	.byte	0x70
-	.byte	0x48
-	.byte	0x50
-	.byte	0xfd
-	.byte	0xed
-	.byte	0xb9
-	.byte	0xda
-	.byte	0x5e
-	.byte	0x15
-	.byte	0x46
-	.byte	0x57
-	.byte	0xa7
-	.byte	0x8d
-	.byte	0x9d
-	.byte	0x84
-	.byte	0x90
-	.byte	0xd8
-	.byte	0xab
-	.byte	0x0
-	.byte	0x8c
-	.byte	0xbc
-	.byte	0xd3
-	.byte	0xa
-	.byte	0xf7
-	.byte	0xe4
-	.byte	0x58
-	.byte	0x5
-	.byte	0xb8
-	.byte	0xb3
-	.byte	0x45
-	.byte	0x6
-	.byte	0xd0
-	.byte	0x2c
-	.byte	0x1e
-	.byte	0x8f
-	.byte	0xca
-	.byte	0x3f
-	.byte	0xf
-	.byte	0x2
-	.byte	0xc1
-	.byte	0xaf
-	.byte	0xbd
-	.byte	0x3
-	.byte	0x1
-	.byte	0x13
-	.byte	0x8a
-	.byte	0x6b
-	.byte	0x3a
-	.byte	0x91
-	.byte	0x11
-	.byte	0x41
-	.byte	0x4f
-	.byte	0x67
-	.byte	0xdc
-	.byte	0xea
-	.byte	0x97
-	.byte	0xf2
-	.byte	0xcf
-	.byte	0xce
-	.byte	0xf0
-	.byte	0xb4
-	.byte	0xe6
-	.byte	0x73
-	.byte	0x96
-	.byte	0xac
-	.byte	0x74
-	.byte	0x22
-	.byte	0xe7
-	.byte	0xad
-	.byte	0x35
-	.byte	0x85
-	.byte	0xe2
-	.byte	0xf9
-	.byte	0x37
-	.byte	0xe8
-	.byte	0x1c
-	.byte	0x75
-	.byte	0xdf
-	.byte	0x6e
-	.byte	0x47
-	.byte	0xf1
-	.byte	0x1a
-	.byte	0x71
-	.byte	0x1d
-	.byte	0x29
-	.byte	0xc5
-	.byte	0x89
-	.byte	0x6f
-	.byte	0xb7
-	.byte	0x62
-	.byte	0xe
-	.byte	0xaa
-	.byte	0x18
-	.byte	0xbe
-	.byte	0x1b
-	.byte	0xfc
-	.byte	0x56
-	.byte	0x3e
-	.byte	0x4b
-	.byte	0xc6
-	.byte	0xd2
-	.byte	0x79
-	.byte	0x20
-	.byte	0x9a
-	.byte	0xdb
-	.byte	0xc0
-	.byte	0xfe
-	.byte	0x78
-	.byte	0xcd
-	.byte	0x5a
-	.byte	0xf4
-	.byte	0x1f
-	.byte	0xdd
-	.byte	0xa8
-	.byte	0x33
-	.byte	0x88
-	.byte	0x7
-	.byte	0xc7
-	.byte	0x31
-	.byte	0xb1
-	.byte	0x12
-	.byte	0x10
-	.byte	0x59
-	.byte	0x27
-	.byte	0x80
-	.byte	0xec
-	.byte	0x5f
-	.byte	0x60
-	.byte	0x51
-	.byte	0x7f
-	.byte	0xa9
-	.byte	0x19
-	.byte	0xb5
-	.byte	0x4a
-	.byte	0xd
-	.byte	0x2d
-	.byte	0xe5
-	.byte	0x7a
-	.byte	0x9f
-	.byte	0x93
-	.byte	0xc9
-	.byte	0x9c
-	.byte	0xef
-	.byte	0xa0
-	.byte	0xe0
-	.byte	0x3b
-	.byte	0x4d
-	.byte	0xae
-	.byte	0x2a
-	.byte	0xf5
-	.byte	0xb0
-	.byte	0xc8
-	.byte	0xeb
-	.byte	0xbb
-	.byte	0x3c
-	.byte	0x83
-	.byte	0x53
-	.byte	0x99
-	.byte	0x61
-	.byte	0x17
-	.byte	0x2b
-	.byte	0x4
-	.byte	0x7e
-	.byte	0xba
-	.byte	0x77
-	.byte	0xd6
-	.byte	0x26
-	.byte	0xe1
-	.byte	0x69
-	.byte	0x14
-	.byte	0x63
-	.byte	0x55
-	.byte	0x21
-	.byte	0xc
-	.byte	0x7d
+	.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+	.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+	.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+	.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+	.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+	.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+	.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+	.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+	.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+	.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+	.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+	.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+	.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+	.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+	.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+	.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+	.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+	.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+	.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+	.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+	.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+	.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+	.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+	.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+	.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+	.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+	.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+	.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+	.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+	.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+	.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+	.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
 	.text
 	.align	4
 	.globl	AES_CBC_decrypt
@@ -19531,7 +18791,7 @@ AES_CBC_decrypt:
 	adr	r2, L_AES_ARM32_cbc_td4
 	ldr	r8, [sp, #36]
 	ldr	r4, [sp, #40]
-	push	{r3, r4}
+	push	{r3-r4}
 	cmp	r8, #10
 	beq	L_AES_CBC_decrypt_loop_block_128
 	cmp	r8, #12
@@ -23954,7 +23214,7 @@ L_AES_CBC_decrypt_end_odd:
 	strd	r10, r11, [r4, #8]
 #endif
 L_AES_CBC_decrypt_end:
-	pop	{r3, r4}
+	pop	{r3-r4}
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	AES_CBC_decrypt,.-AES_CBC_decrypt
 #endif /* HAVE_AES_CBC */
@@ -23962,27 +23222,24 @@ L_AES_CBC_decrypt_end:
         * HAVE_AES_ECB */
 #endif /* HAVE_AES_DECRYPT */
 #ifdef HAVE_AESGCM
+#ifndef __APPLE__
 	.text
 	.type	L_GCM_gmult_len_r, %object
 	.size	L_GCM_gmult_len_r, 64
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_GCM_gmult_len_r:
-	.word	0x0
-	.word	0x1c200000
-	.word	0x38400000
-	.word	0x24600000
-	.word	0x70800000
-	.word	0x6ca00000
-	.word	0x48c00000
-	.word	0x54e00000
-	.word	0xe1000000
-	.word	0xfd200000
-	.word	0xd9400000
-	.word	0xc5600000
-	.word	0x91800000
-	.word	0x8da00000
-	.word	0xa9c00000
-	.word	0xb5e00000
+	.long	0x00000000,0x1c200000,0x38400000,0x24600000
+	.long	0x70800000,0x6ca00000,0x48c00000,0x54e00000
+	.long	0xe1000000,0xfd200000,0xd9400000,0xc5600000
+	.long	0x91800000,0x8da00000,0xa9c00000,0xb5e00000
 	.text
 	.align	4
 	.globl	GCM_gmult_len
@@ -24561,12 +23818,21 @@ L_GCM_gmult_len_start_block:
 	bne	L_GCM_gmult_len_start_block
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	GCM_gmult_len,.-GCM_gmult_len
+#ifndef __APPLE__
 	.text
 	.type	L_AES_ARM32_te_gcm, %object
 	.size	L_AES_ARM32_te_gcm, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_ARM32_te_gcm:
-	.word	L_AES_ARM32_te_data
+	.long	L_AES_ARM32_te_data
 	.text
 	.align	4
 	.globl	AES_GCM_encrypt
diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
index 360e0fc5981..52adcfc5f24 100644
--- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -59,13 +57,13 @@ WC_OMIT_FRAME_POINTER void AES_set_key_AARCH32(const byte* userKey_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_set_key_AARCH32(const byte* userKey, int keylen,
     byte* key, int dir)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* userKey asm ("r0") = (const byte*)userKey_p;
-    register int keylen asm ("r1") = (int)keylen_p;
-    register byte* key asm ("r2") = (byte*)key_p;
-    register int dir asm ("r3") = (int)dir_p;
+    register const byte* userKey __asm__ ("r0") = (const byte*)userKey_p;
+    register int keylen __asm__ ("r1") = (int)keylen_p;
+    register byte* key __asm__ ("r2") = (byte*)key_p;
+    register int dir __asm__ ("r3") = (int)dir_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -368,7 +366,7 @@ WC_OMIT_FRAME_POINTER void AES_set_key_AARCH32(const byte* userKey, int keylen,
         "vst1.32	{q0}, [%[key]]\n\t"
         "b	L_aes_set_key_arm32_crypto_done_%=\n\t"
         "\n"
-    "L_aes_set_key_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_set_key_arm32_crypto_start_256_%=:\n\t"
         "ldr	r4, [%[userKey]], #4\n\t"
         "ldr	r5, [%[userKey]], #4\n\t"
         "ldr	r6, [%[userKey]], #4\n\t"
@@ -721,7 +719,7 @@ WC_OMIT_FRAME_POINTER void AES_set_key_AARCH32(const byte* userKey, int keylen,
         "vst1.32	{q0}, [%[key]]\n\t"
         "b	L_aes_set_key_arm32_crypto_done_%=\n\t"
         "\n"
-    "L_aes_set_key_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_set_key_arm32_crypto_start_128_%=:\n\t"
         "ldr	r4, [%[userKey]], #4\n\t"
         "ldr	r5, [%[userKey]], #4\n\t"
         "ldr	r6, [%[userKey]], #4\n\t"
@@ -993,7 +991,7 @@ WC_OMIT_FRAME_POINTER void AES_set_key_AARCH32(const byte* userKey, int keylen,
         "aesimc.8	q0, q0\n\t"
         "vst1.32	{q0}, [%[key]]\n\t"
         "\n"
-    "L_aes_set_key_arm32_crypto_done_%=: \n\t"
+    "L_aes_set_key_arm32_crypto_done_%=:\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [userKey] "+r" (userKey), [keylen] "+r" (keylen), [key] "+r" (key),
           [dir] "+r" (dir)
@@ -1017,13 +1015,13 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_AARCH32(const byte* inBlock_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_encrypt_AARCH32(const byte* inBlock,
     byte* outBlock, byte* key, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* inBlock asm ("r0") = (const byte*)inBlock_p;
-    register byte* outBlock asm ("r1") = (byte*)outBlock_p;
-    register byte* key asm ("r2") = (byte*)key_p;
-    register int nr asm ("r3") = (int)nr_p;
+    register const byte* inBlock __asm__ ("r0") = (const byte*)inBlock_p;
+    register byte* outBlock __asm__ ("r1") = (byte*)outBlock_p;
+    register byte* key __asm__ ("r2") = (byte*)key_p;
+    register int nr __asm__ ("r3") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1065,7 +1063,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_AARCH32(const byte* inBlock,
         "aesmc.8	q0, q0\n\t"
         "aese.8	q0, q2\n\t"
         "\n"
-    "L_aes_encrypt_arm32_crypto_round_done_%=: \n\t"
+    "L_aes_encrypt_arm32_crypto_round_done_%=:\n\t"
         "vld1.32	{q1}, [%[key]]\n\t"
         "veor.32	q0, q0, q1\n\t"
         "vst1.8	{q0}, [%[outBlock]]\n\t"
@@ -1094,13 +1092,13 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_AARCH32(const byte* inBlock_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_decrypt_AARCH32(const byte* inBlock,
     byte* outBlock, byte* key, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* inBlock asm ("r0") = (const byte*)inBlock_p;
-    register byte* outBlock asm ("r1") = (byte*)outBlock_p;
-    register byte* key asm ("r2") = (byte*)key_p;
-    register int nr asm ("r3") = (int)nr_p;
+    register const byte* inBlock __asm__ ("r0") = (const byte*)inBlock_p;
+    register byte* outBlock __asm__ ("r1") = (byte*)outBlock_p;
+    register byte* key __asm__ ("r2") = (byte*)key_p;
+    register int nr __asm__ ("r3") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1142,7 +1140,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_AARCH32(const byte* inBlock,
         "aesimc.8	q0, q0\n\t"
         "aesd.8	q0, q2\n\t"
         "\n"
-    "L_aes_decrypt_arm32_crypto_round_done_%=: \n\t"
+    "L_aes_decrypt_arm32_crypto_round_done_%=:\n\t"
         "vld1.32	{q1}, [%[key]]\n\t"
         "veor.32	q0, q0, q1\n\t"
         "vst1.8	{q0}, [%[outBlock]]\n\t"
@@ -1169,14 +1167,14 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
     word32 sz, byte* key, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register byte* key asm ("r3") = (byte*)key_p;
-    register int nr asm ("r12") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register byte* key __asm__ ("r3") = (byte*)key_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1193,7 +1191,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #1\n\t"
         "beq	L_aes_encrypt_blocks_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_192_start_4_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_192_start_4_%=:\n\t"
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_encrypt_blocks_arm32_crypto_192_start_2_%=\n\t"
         "vldm.8	%[in]!, {q12-q15}\n\t"
@@ -1302,7 +1300,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "bge	L_aes_encrypt_blocks_arm32_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_192_start_2_%=:\n\t"
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_encrypt_blocks_arm32_crypto_192_start_1_%=\n\t"
         "vld1.8	{q12-q13}, [%[in]]!\n\t"
@@ -1361,7 +1359,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[sz], %[sz], #2\n\t"
         "vst1.8	{q12-q13}, [%[out]]!\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_192_start_1_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_encrypt_blocks_arm32_crypto_192_done_%=\n\t"
         "vld1.8	{q12}, [%[in]]!\n\t"
@@ -1395,18 +1393,18 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[key], %[key], #48\n\t"
         "vst1.8	{q12}, [%[out]]!\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_encrypt_blocks_arm32_crypto_done_%=\n\t"
         /* AES_ECB_256 */
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vld1.32	{q8-q9}, [%[key]]!\n\t"
         "cmp	%[sz], #1\n\t"
         "beq	L_aes_encrypt_blocks_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_256_start_4_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_256_start_4_%=:\n\t"
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_encrypt_blocks_arm32_crypto_256_start_2_%=\n\t"
         "vldm.8	%[in]!, {q12-q15}\n\t"
@@ -1533,7 +1531,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "bge	L_aes_encrypt_blocks_arm32_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_256_start_2_%=:\n\t"
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_encrypt_blocks_arm32_crypto_256_start_1_%=\n\t"
         "vld1.8	{q12-q13}, [%[in]]!\n\t"
@@ -1602,7 +1600,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[sz], %[sz], #2\n\t"
         "vst1.8	{q12-q13}, [%[out]]!\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_256_start_1_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_encrypt_blocks_arm32_crypto_256_done_%=\n\t"
         "vld1.8	{q12}, [%[in]]!\n\t"
@@ -1642,18 +1640,18 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[key], %[key], #0x50\n\t"
         "vst1.8	{q12}, [%[out]]!\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_encrypt_blocks_arm32_crypto_done_%=\n\t"
         /* AES_ECB_128 */
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vldm.32	%[key]!, {q8-q10}\n\t"
         "cmp	%[sz], #1\n\t"
         "beq	L_aes_encrypt_blocks_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_128_start_4_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_128_start_4_%=:\n\t"
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_encrypt_blocks_arm32_crypto_128_start_2_%=\n\t"
         "vldm.8	%[in]!, {q12-q15}\n\t"
@@ -1742,7 +1740,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "bge	L_aes_encrypt_blocks_arm32_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_128_start_2_%=:\n\t"
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_encrypt_blocks_arm32_crypto_128_start_1_%=\n\t"
         "vld1.8	{q12-q13}, [%[in]]!\n\t"
@@ -1789,7 +1787,7 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[sz], %[sz], #2\n\t"
         "vst1.8	{q12-q13}, [%[out]]!\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_128_start_1_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_encrypt_blocks_arm32_crypto_128_done_%=\n\t"
         "vld1.8	{q12}, [%[in]]!\n\t"
@@ -1815,10 +1813,10 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_blocks_AARCH32(const byte* in, byte* out,
         "veor.32	q12, q12, q10\n\t"
         "vst1.8	{q12}, [%[out]]!\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_encrypt_blocks_arm32_crypto_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm32_crypto_done_%=:\n\t"
         "pop	{%[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
@@ -1841,14 +1839,14 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
     word32 sz, byte* key, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register byte* key asm ("r3") = (byte*)key_p;
-    register int nr asm ("r12") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register byte* key __asm__ ("r3") = (byte*)key_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1867,7 +1865,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_decrypt_blocks_arm32_crypto_192_start_2_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_192_start_4_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_192_start_4_%=:\n\t"
         "vldm.8	%[in]!, {q12-q15}\n\t"
         "aesd.8	q12, q0\n\t"
         "aesimc.8	q12, q12\n\t"
@@ -1974,7 +1972,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "bge	L_aes_decrypt_blocks_arm32_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_192_start_2_%=:\n\t"
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_decrypt_blocks_arm32_crypto_192_start_1_%=\n\t"
         "vld1.8	{q12-q13}, [%[in]]!\n\t"
@@ -2033,7 +2031,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[sz], %[sz], #2\n\t"
         "vst1.8	{q12-q13}, [%[out]]!\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_192_start_1_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_decrypt_blocks_arm32_crypto_192_done_%=\n\t"
         "vld1.8	{q12}, [%[in]]!\n\t"
@@ -2067,12 +2065,12 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[key], %[key], #48\n\t"
         "vst1.8	{q12}, [%[out]]!\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_decrypt_blocks_arm32_crypto_done_%=\n\t"
         /* AES_ECB_256 */
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vld1.32	{q8-q9}, [%[key]]!\n\t"
         "cmp	%[sz], #1\n\t"
@@ -2080,7 +2078,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_decrypt_blocks_arm32_crypto_256_start_2_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_256_start_4_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_256_start_4_%=:\n\t"
         "vldm.8	%[in]!, {q12-q15}\n\t"
         "aesd.8	q12, q0\n\t"
         "aesimc.8	q12, q12\n\t"
@@ -2205,7 +2203,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "bge	L_aes_decrypt_blocks_arm32_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_256_start_2_%=:\n\t"
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_decrypt_blocks_arm32_crypto_256_start_1_%=\n\t"
         "vld1.8	{q12-q13}, [%[in]]!\n\t"
@@ -2274,7 +2272,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[sz], %[sz], #2\n\t"
         "vst1.8	{q12-q13}, [%[out]]!\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_256_start_1_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_decrypt_blocks_arm32_crypto_256_done_%=\n\t"
         "vld1.8	{q12}, [%[in]]!\n\t"
@@ -2314,12 +2312,12 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[key], %[key], #0x50\n\t"
         "vst1.8	{q12}, [%[out]]!\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_decrypt_blocks_arm32_crypto_done_%=\n\t"
         /* AES_ECB_128 */
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vldm.32	%[key]!, {q8-q10}\n\t"
         "cmp	%[sz], #1\n\t"
@@ -2327,7 +2325,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_decrypt_blocks_arm32_crypto_128_start_2_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_128_start_4_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_128_start_4_%=:\n\t"
         "vldm.8	%[in]!, {q12-q15}\n\t"
         "aesd.8	q12, q0\n\t"
         "aesimc.8	q12, q12\n\t"
@@ -2414,7 +2412,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "bge	L_aes_decrypt_blocks_arm32_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_128_start_2_%=:\n\t"
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_decrypt_blocks_arm32_crypto_128_start_1_%=\n\t"
         "vld1.8	{q12-q13}, [%[in]]!\n\t"
@@ -2461,7 +2459,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "sub	%[sz], %[sz], #2\n\t"
         "vst1.8	{q12-q13}, [%[out]]!\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_128_start_1_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_decrypt_blocks_arm32_crypto_128_done_%=\n\t"
         "vld1.8	{q12}, [%[in]]!\n\t"
@@ -2487,10 +2485,10 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_blocks_AARCH32(const byte* in, byte* out,
         "veor.32	q12, q12, q10\n\t"
         "vst1.8	{q12}, [%[out]]!\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_decrypt_blocks_arm32_crypto_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm32_crypto_done_%=:\n\t"
         "pop	{%[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
@@ -2515,15 +2513,15 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
     word32 sz, byte* reg, byte* key, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register byte* reg asm ("r3") = (byte*)reg_p;
-    register byte* key asm ("r12") = (byte*)key_p;
-    register int nr asm ("lr") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register byte* reg __asm__ ("r3") = (byte*)reg_p;
+    register byte* key __asm__ ("r12") = (byte*)key_p;
+    register int nr __asm__ ("lr") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2545,7 +2543,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_cbc_encrypt_arm32_crypto_192_start_2_%=\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_192_start_4_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_192_start_4_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -2661,7 +2659,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_cbc_encrypt_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_192_start_2_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -2719,7 +2717,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vst1.8	{q15}, [%[out]]!\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_192_start_1_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -2746,13 +2744,13 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "aese.8	q15, q11\n\t"
         "veor.32	q15, q15, q12\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_192_done_%=:\n\t"
         "vst1.8	{q15}, [%[out]]!\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_cbc_encrypt_arm32_crypto_done_%=\n\t"
         /* AES_CBC_256 */
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vldm.32	r12!, {q8-q11}\n\t"
@@ -2764,7 +2762,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_cbc_encrypt_arm32_crypto_256_start_2_%=\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_256_start_4_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_256_start_4_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -2900,7 +2898,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_cbc_encrypt_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_256_start_2_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -2968,7 +2966,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vst1.8	{q15}, [%[out]]!\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_256_start_1_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -3000,13 +2998,13 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "aese.8	q15, q12\n\t"
         "veor.32	q15, q15, q13\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_256_done_%=:\n\t"
         "vst1.8	{q15}, [%[out]]!\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_cbc_encrypt_arm32_crypto_done_%=\n\t"
         /* AES_CBC_128 */
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vldm.32	r12!, {q8-q10}\n\t"
@@ -3015,7 +3013,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #4\n\t"
         "blt	L_aes_cbc_encrypt_arm32_crypto_128_start_2_%=\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_128_start_4_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_128_start_4_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -3115,7 +3113,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #2\n\t"
         "blt	L_aes_cbc_encrypt_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_128_start_2_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -3165,7 +3163,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vst1.8	{q15}, [%[out]]!\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_128_start_1_%=:\n\t"
         "veor.32	q15, q15, q14\n\t"
         "aese.8	q15, q0\n\t"
         "aesmc.8	q15, q15\n\t"
@@ -3188,11 +3186,11 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt_AARCH32(const byte* in, byte* out,
         "aese.8	q15, q9\n\t"
         "veor.32	q15, q15, q10\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_128_done_%=:\n\t"
         "vst1.8	{q15}, [%[out]]!\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_cbc_encrypt_arm32_crypto_done_%=: \n\t"
+    "L_aes_cbc_encrypt_arm32_crypto_done_%=:\n\t"
         "vst1.32	{q15}, [%[reg]]\n\t"
         "pop	{%[key], %[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -3216,15 +3214,15 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
     word32 sz, byte* reg, byte* key, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register byte* reg asm ("r3") = (byte*)reg_p;
-    register byte* key asm ("r12") = (byte*)key_p;
-    register int nr asm ("lr") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register byte* reg __asm__ ("r3") = (byte*)reg_p;
+    register byte* key __asm__ ("r12") = (byte*)key_p;
+    register int nr __asm__ ("lr") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3243,7 +3241,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	%[sz], #1\n\t"
         "beq	L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_2_%=:\n\t"
         "vld1.8	{q14-q15}, [%[in]]!\n\t"
         "vmov	q11, q13\n\t"
         "vmov	q12, q14\n\t"
@@ -3309,7 +3307,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_cbc_decrypt_blocks_arm32_crypto_192_done_%=\n\t"
         "bgt	L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_2_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_192_start_1_%=:\n\t"
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vmov	q11, q13\n\t"
         "vmov	q13, q14\n\t"
@@ -3344,18 +3342,18 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q14, q14, q11\n\t"
         "vst1.8	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_cbc_decrypt_blocks_arm32_crypto_done_%=\n\t"
         /* AES_CBC_256 */
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vld1.32	{q8}, [r12]!\n\t"
         "cmp	%[sz], #1\n\t"
         "beq	L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_2_%=:\n\t"
         "vld1.8	{q14-q15}, [%[in]]!\n\t"
         "vmov	q11, q13\n\t"
         "vmov	q12, q14\n\t"
@@ -3431,7 +3429,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_cbc_decrypt_blocks_arm32_crypto_256_done_%=\n\t"
         "bgt	L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_2_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_256_start_1_%=:\n\t"
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vmov	q11, q13\n\t"
         "vmov	q13, q14\n\t"
@@ -3472,18 +3470,18 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q14, q14, q11\n\t"
         "vst1.8	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_cbc_decrypt_blocks_arm32_crypto_done_%=\n\t"
         /* AES_CBC_128 */
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vldm.32	r12!, {q8-q10}\n\t"
         "cmp	%[sz], #1\n\t"
         "beq	L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_2_%=:\n\t"
         "vld1.8	{q14-q15}, [%[in]]!\n\t"
         "vmov	q11, q13\n\t"
         "vmov	q12, q14\n\t"
@@ -3536,7 +3534,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_cbc_decrypt_blocks_arm32_crypto_128_done_%=\n\t"
         "bgt	L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_2_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_128_start_1_%=:\n\t"
         "vld1.8	{q14}, [%[in]]!\n\t"
         "vmov	q11, q13\n\t"
         "vmov	q13, q14\n\t"
@@ -3563,10 +3561,10 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q14, q14, q11\n\t"
         "vst1.8	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm32_crypto_done_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm32_crypto_done_%=:\n\t"
         "vst1.32	{q13}, [%[reg]]\n\t"
         "pop	{%[key], %[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -3593,17 +3591,17 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
     word32 sz, byte* reg, byte* key, byte* tmp, word32* left, word32 nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register byte* reg asm ("r3") = (byte*)reg_p;
-    register byte* key asm ("r12") = (byte*)key_p;
-    register byte* tmp asm ("lr") = (byte*)tmp_p;
-    register word32* left asm ("r4") = (word32*)left_p;
-    register word32 nr asm ("r5") = (word32)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register byte* reg __asm__ ("r3") = (byte*)reg_p;
+    register byte* key __asm__ ("r12") = (byte*)key_p;
+    register byte* tmp __asm__ ("lr") = (byte*)tmp_p;
+    register word32* left __asm__ ("r4") = (word32*)left_p;
+    register word32 nr __asm__ ("r5") = (word32)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3636,7 +3634,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "vmov	d2, r5, r6\n\t"
         "vrev32.8	q1, q1\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_192_start_2_%=:\n\t"
         "aese.8	q0, q3\n\t"
         "aesmc.8	q0, q0\n\t"
         "aese.8	q1, q3\n\t"
@@ -3712,7 +3710,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	lr, #0\n\t"
         "blt	L_aes_ctr_encrypt_arm32_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_192_start_1_%=:\n\t"
         "aese.8	q0, q3\n\t"
         "aesmc.8	q0, q0\n\t"
         "adds	r8, r8, lr\n\t"
@@ -3752,7 +3750,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	lr, #1\n\t"
         "vrev32.8	q0, q2\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_192_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_ctr_encrypt_arm32_crypto_192_partial_done_%=\n\t"
         "ldr	r4, [sp, #8]\n\t"
@@ -3794,7 +3792,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	r5, #16\n\t"
         "sub	r5, r5, %[sz]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_192_start_byte_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_192_start_byte_%=:\n\t"
         "ldrb	r7, [lr], #1\n\t"
         "ldrb	r8, [%[in]], #1\n\t"
         "eor	r7, r7, r8\n\t"
@@ -3804,12 +3802,12 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "vrev32.8	q0, q2\n\t"
         "str	r5, [r4]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_192_partial_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_192_partial_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_ctr_encrypt_arm32_crypto_done_%=\n\t"
         /* AES_CTR_256 */
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vldm.32	r12!, {q11-q13}\n\t"
         "mov	lr, #1\n\t"
@@ -3824,7 +3822,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "vmov	d2, r5, r6\n\t"
         "vrev32.8	q1, q1\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_256_start_2_%=:\n\t"
         "aese.8	q0, q3\n\t"
         "aesmc.8	q0, q0\n\t"
         "aese.8	q1, q3\n\t"
@@ -3910,7 +3908,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	lr, #0\n\t"
         "blt	L_aes_ctr_encrypt_arm32_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_256_start_1_%=:\n\t"
         "aese.8	q0, q3\n\t"
         "aesmc.8	q0, q0\n\t"
         "adds	r8, r8, lr\n\t"
@@ -3956,7 +3954,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	lr, #1\n\t"
         "vrev32.8	q0, q2\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_256_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_ctr_encrypt_arm32_crypto_256_partial_done_%=\n\t"
         "ldr	r4, [sp, #8]\n\t"
@@ -4004,7 +4002,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	r5, #16\n\t"
         "sub	r5, r5, %[sz]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_256_start_byte_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_256_start_byte_%=:\n\t"
         "ldrb	r7, [lr], #1\n\t"
         "ldrb	r8, [%[in]], #1\n\t"
         "eor	r7, r7, r8\n\t"
@@ -4014,12 +4012,12 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "vrev32.8	q0, q2\n\t"
         "str	r5, [r4]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_256_partial_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_256_partial_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_ctr_encrypt_arm32_crypto_done_%=\n\t"
         /* AES_CTR_128 */
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vldm.32	r12!, {q11-q13}\n\t"
         "mov	lr, #1\n\t"
@@ -4034,7 +4032,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "vmov	d2, r5, r6\n\t"
         "vrev32.8	q1, q1\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_128_start_2_%=:\n\t"
         "aese.8	q0, q3\n\t"
         "aesmc.8	q0, q0\n\t"
         "aese.8	q1, q3\n\t"
@@ -4099,7 +4097,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	lr, #0\n\t"
         "blt	L_aes_ctr_encrypt_arm32_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_128_start_1_%=:\n\t"
         "aese.8	q0, q3\n\t"
         "aesmc.8	q0, q0\n\t"
         "adds	r8, r8, lr\n\t"
@@ -4132,7 +4130,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	lr, #1\n\t"
         "vrev32.8	q0, q2\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_128_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_ctr_encrypt_arm32_crypto_128_partial_done_%=\n\t"
         "ldr	r4, [sp, #8]\n\t"
@@ -4168,7 +4166,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "mov	r5, #16\n\t"
         "sub	r5, r5, %[sz]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_128_start_byte_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_128_start_byte_%=:\n\t"
         "ldrb	r7, [lr], #1\n\t"
         "ldrb	r8, [%[in]], #1\n\t"
         "eor	r7, r7, r8\n\t"
@@ -4178,10 +4176,10 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt_AARCH32(const byte* in, byte* out,
         "vrev32.8	q0, q2\n\t"
         "str	r5, [r4]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_128_partial_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_128_partial_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_ctr_encrypt_arm32_crypto_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm32_crypto_done_%=:\n\t"
         "vst1.32	{q0}, [%[reg]]\n\t"
         "pop	{%[key], %[tmp]}\n\t"
         "pop	{%[left], %[nr]}\n\t"
@@ -4208,13 +4206,13 @@ WC_OMIT_FRAME_POINTER void AES_GCM_set_key_AARCH32(const byte* nonce_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_GCM_set_key_AARCH32(const byte* nonce,
     const byte* key, byte* gcm_h, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* nonce asm ("r0") = (const byte*)nonce_p;
-    register const byte* key asm ("r1") = (const byte*)key_p;
-    register byte* gcm_h asm ("r2") = (byte*)gcm_h_p;
-    register int nr asm ("r3") = (int)nr_p;
+    register const byte* nonce __asm__ ("r0") = (const byte*)nonce_p;
+    register const byte* key __asm__ ("r1") = (const byte*)key_p;
+    register byte* gcm_h __asm__ ("r2") = (byte*)gcm_h_p;
+    register int nr __asm__ ("r3") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4258,7 +4256,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_set_key_AARCH32(const byte* nonce,
         "aesmc.8	q0, q0\n\t"
         "aese.8	q0, q2\n\t"
         "\n"
-    "L_aes_gcm_set_key_arm32_crypto_round_done_%=: \n\t"
+    "L_aes_gcm_set_key_arm32_crypto_round_done_%=:\n\t"
         "vld1.8	{q1}, [%[key]]\n\t"
         "veor	q0, q0, q1\n\t"
         "vmov.i8	q1, #0x55\n\t"
@@ -4295,23 +4293,23 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
     word32 sz, const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz,
     const byte* aad, word32 aadSz, byte* key, byte* gcm_h, byte* tmp, byte* reg,
     int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register const byte* nonce asm ("r3") = (const byte*)nonce_p;
-    register word32 nonceSz asm ("r12") = (word32)nonceSz_p;
-    register byte* tag asm ("lr") = (byte*)tag_p;
-    register word32 tagSz asm ("r4") = (word32)tagSz_p;
-    register const byte* aad asm ("r5") = (const byte*)aad_p;
-    register word32 aadSz asm ("r6") = (word32)aadSz_p;
-    register byte* key asm ("r7") = (byte*)key_p;
-    register byte* gcm_h asm ("r8") = (byte*)gcm_h_p;
-    register byte* tmp asm ("r9") = (byte*)tmp_p;
-    register byte* reg asm ("r10") = (byte*)reg_p;
-    register int nr asm ("r11") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register const byte* nonce __asm__ ("r3") = (const byte*)nonce_p;
+    register word32 nonceSz __asm__ ("r12") = (word32)nonceSz_p;
+    register byte* tag __asm__ ("lr") = (byte*)tag_p;
+    register word32 tagSz __asm__ ("r4") = (word32)tagSz_p;
+    register const byte* aad __asm__ ("r5") = (const byte*)aad_p;
+    register word32 aadSz __asm__ ("r6") = (word32)aadSz_p;
+    register byte* key __asm__ ("r7") = (byte*)key_p;
+    register byte* gcm_h __asm__ ("r8") = (byte*)gcm_h_p;
+    register byte* tmp __asm__ ("r9") = (byte*)tmp_p;
+    register byte* reg __asm__ ("r10") = (byte*)reg_p;
+    register int nr __asm__ ("r11") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4332,7 +4330,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "vshr.u64	q13, q13, #56\n\t"
         "vld1.32	{q8}, [r8]\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_setup_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_setup_done_%=:\n\t"
         /* Load Nonce */
         "cmp	r12, #12\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_ghash_nonce_%=\n\t"
@@ -4346,12 +4344,12 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "mov	r5, #1\n\t"
         "b	L_aes_gcm_encrypt_arm32_crypto_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_ghash_nonce_%=:\n\t"
         "lsr	r10, r12, #4\n\t"
         "cmp	r10, #0\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_nonce_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_1_%=:\n\t"
         "vld1.32	{q14}, [%[nonce]]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -4382,7 +4380,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "subs	r10, r10, #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_nonce_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_done_%=:\n\t"
         "ands	r11, r12, #15\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_nonce_partial_done_%=\n\t"
         "veor.8	q0, q0, q0\n\t"
@@ -4391,21 +4389,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_nonce_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_dw_%=:\n\t"
         "ldr	r8, [%[nonce]], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_nonce_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_nonce_start_byte_%=\n\t"
         "ldrh	r8, [%[nonce]], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_nonce_end_bytes_%=\n\t"
         "ldrb	r8, [%[nonce]], #1\n\t"
@@ -4413,7 +4411,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_nonce_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "vld1.32	{q14}, [r9]\n\t"
         "vmov.i8	q12, #0x55\n\t"
@@ -4443,7 +4441,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q6, q6, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_nonce_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_nonce_partial_done_%=:\n\t"
         "veor.8	q0, q0, q0\n\t"
         /* nonceSz */
         "ldr	r12, [sp]\n\t"
@@ -4484,7 +4482,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "vmov.32	s27, r5\n\t"
         "rev	r5, r5\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_done_nonce_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_done_nonce_%=:\n\t"
         "vldm.32	r7!, {q0-q3}\n\t"
         "vldm.32	r7!, {q7-q13}\n\t"
         /* nr */
@@ -4499,7 +4497,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_done_%=\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_start_2_%=:\n\t"
         "add	r8, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "add	r5, r5, #2\n\t"
@@ -4568,7 +4566,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "bgt	L_aes_gcm_encrypt_arm32_crypto_192_start_2_%=\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_start_1_%=:\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "rev	r8, r5\n\t"
@@ -4604,7 +4602,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q14, q14, q4\n\t"
         "vst1.32	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_192_partial_done_%=\n\t"
         "veor.8	q14, q14, q14\n\t"
@@ -4613,21 +4611,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_start_dw_%=:\n\t"
         "ldr	lr, [%[in]], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [r9], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_192_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_start_byte_%=\n\t"
         "ldrh	lr, [%[in]], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_end_bytes_%=\n\t"
         "ldrb	lr, [%[in]], #1\n\t"
@@ -4635,7 +4633,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [r9], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_192_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
@@ -4675,21 +4673,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_out_start_dw_%=:\n\t"
         "ldr	lr, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [%[out]], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_192_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_out_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_out_start_byte_%=\n\t"
         "ldrh	lr, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [%[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_out_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_192_out_end_bytes_%=\n\t"
         "ldrb	lr, [r9], #1\n\t"
@@ -4697,9 +4695,9 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [%[out]], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_192_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_192_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_192_partial_done_%=:\n\t"
         /* Finish */
         "add	r8, %[sz], #15\n\t"
         "sub	r8, r5, r8, lsr #4\n\t"
@@ -4736,13 +4734,13 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "b	L_aes_gcm_encrypt_arm32_crypto_done_enc_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	r10, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_done_%=\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_start_2_%=:\n\t"
         "add	r8, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "add	r5, r5, #2\n\t"
@@ -4821,7 +4819,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "bgt	L_aes_gcm_encrypt_arm32_crypto_256_start_2_%=\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_start_1_%=:\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "rev	r8, r5\n\t"
@@ -4863,7 +4861,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q14, q14, q4\n\t"
         "vst1.32	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_256_partial_done_%=\n\t"
         "veor.8	q14, q14, q14\n\t"
@@ -4872,21 +4870,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_start_dw_%=:\n\t"
         "ldr	lr, [%[in]], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [r9], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_256_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_start_byte_%=\n\t"
         "ldrh	lr, [%[in]], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_end_bytes_%=\n\t"
         "ldrb	lr, [%[in]], #1\n\t"
@@ -4894,7 +4892,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [r9], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_256_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
@@ -4940,21 +4938,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_out_start_dw_%=:\n\t"
         "ldr	lr, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [%[out]], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_256_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_out_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_out_start_byte_%=\n\t"
         "ldrh	lr, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [%[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_out_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_256_out_end_bytes_%=\n\t"
         "ldrb	lr, [r9], #1\n\t"
@@ -4962,9 +4960,9 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [%[out]], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_256_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_256_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_256_partial_done_%=:\n\t"
         /* Finish */
         "add	r8, %[sz], #15\n\t"
         "sub	r8, r5, r8, lsr #4\n\t"
@@ -5007,13 +5005,13 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "b	L_aes_gcm_encrypt_arm32_crypto_done_enc_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	r10, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_done_%=\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_start_2_%=:\n\t"
         "add	r8, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "add	r5, r5, #2\n\t"
@@ -5071,7 +5069,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "bgt	L_aes_gcm_encrypt_arm32_crypto_128_start_2_%=\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_start_1_%=:\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "rev	r8, r5\n\t"
@@ -5100,7 +5098,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q14, q14, q4\n\t"
         "vst1.32	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_128_partial_done_%=\n\t"
         "veor.8	q14, q14, q14\n\t"
@@ -5109,21 +5107,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_start_dw_%=:\n\t"
         "ldr	lr, [%[in]], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [r9], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_128_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_start_byte_%=\n\t"
         "ldrh	lr, [%[in]], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_end_bytes_%=\n\t"
         "ldrb	lr, [%[in]], #1\n\t"
@@ -5131,7 +5129,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [r9], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_128_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
@@ -5164,21 +5162,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_out_start_dw_%=:\n\t"
         "ldr	lr, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [%[out]], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_128_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_out_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_out_start_byte_%=\n\t"
         "ldrh	lr, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [%[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_out_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_128_out_end_bytes_%=\n\t"
         "ldrb	lr, [r9], #1\n\t"
@@ -5186,9 +5184,9 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [%[out]], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_128_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_128_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_128_partial_done_%=:\n\t"
         /* Finish */
         "add	r8, %[sz], #15\n\t"
         "sub	r8, r5, r8, lsr #4\n\t"
@@ -5216,7 +5214,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q6, q6, q13\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_done_enc_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_done_enc_%=:\n\t"
         /* aadSz */
         "ldr	r6, [sp, #16]\n\t"
         /* gcm_h */
@@ -5263,7 +5261,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q11, q11, q0\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_h_done_%=:\n\t"
         /* aad */
         "ldr	r5, [sp, #12]\n\t"
         "lsr	r10, r6, #4\n\t"
@@ -5273,7 +5271,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_aad_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_start_4_%=:\n\t"
         "vldm	r5!, {q0-q2}\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vmov.i8	q5, #51\n\t"
@@ -5365,7 +5363,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_gcm_encrypt_arm32_crypto_aad_done_%=\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_aad_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_start_2_%=:\n\t"
         "vld1.32	{q14-q15}, [r5]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -5414,7 +5412,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #0\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_start_1_%=:\n\t"
         "vld1.32	{q14}, [r5]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -5443,7 +5441,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_done_%=:\n\t"
         "ands	r11, r6, #15\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_aad_partial_done_%=\n\t"
         "veor.8	q0, q0, q0\n\t"
@@ -5452,21 +5450,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_aad_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_start_dw_%=:\n\t"
         "ldr	r8, [r5], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_aad_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_aad_start_byte_%=\n\t"
         "ldrh	r8, [r5], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_aad_end_bytes_%=\n\t"
         "ldrb	r8, [r5], #1\n\t"
@@ -5474,7 +5472,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_aad_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "vld1.32	{q14}, [r9]\n\t"
         "vmov.i8	q12, #0x55\n\t"
@@ -5504,7 +5502,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_aad_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_aad_partial_done_%=:\n\t"
         /* out */
         "lsr	r10, %[sz], #4\n\t"
         "cmp	r10, #1\n\t"
@@ -5513,7 +5511,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_out_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_start_4_%=:\n\t"
         "vldm	%[out]!, {q0-q2}\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vmov.i8	q5, #51\n\t"
@@ -5605,7 +5603,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_gcm_encrypt_arm32_crypto_out_done_%=\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_out_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_start_2_%=:\n\t"
         "vld1.32	{q14-q15}, [%[out]]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -5654,7 +5652,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #0\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_out_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_start_1_%=:\n\t"
         "vld1.32	{q14}, [%[out]]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -5683,7 +5681,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_encrypt_arm32_crypto_out_partial_done_%=\n\t"
         "veor.8	q0, q0, q0\n\t"
@@ -5692,21 +5690,21 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_start_dw_%=:\n\t"
         "ldr	r8, [%[out]], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_out_start_byte_%=\n\t"
         "ldrh	r8, [%[out]], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_out_end_bytes_%=\n\t"
         "ldrb	r8, [%[out]], #1\n\t"
@@ -5714,7 +5712,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "vld1.32	{q14}, [r9]\n\t"
         "vmov.i8	q12, #0x55\n\t"
@@ -5744,7 +5742,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_out_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_out_partial_done_%=:\n\t"
         "lsr	lr, r6, #29\n\t"
         "lsl	r6, r6, #3\n\t"
         "rbit	lr, lr\n\t"
@@ -5792,26 +5790,26 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q7}, [lr]\n\t"
         "b	L_aes_gcm_encrypt_arm32_crypto_done_gcm_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_partial_%=:\n\t"
         "vst1.8	{q7}, [r9]\n\t"
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_dw_%=:\n\t"
         "ldr	r8, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	r8, [lr], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_byte_%=\n\t"
         "ldrh	r8, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	r8, [lr], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_encrypt_arm32_crypto_tag_tag_end_bytes_%=\n\t"
         "ldrb	r8, [r9], #1\n\t"
@@ -5819,9 +5817,9 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [lr], #1\n\t"
         "bne	L_aes_gcm_encrypt_arm32_crypto_tag_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_tag_tag_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm32_crypto_done_gcm_%=: \n\t"
+    "L_aes_gcm_encrypt_arm32_crypto_done_gcm_%=:\n\t"
         "pop	{%[nonceSz], %[tag]}\n\t"
         "pop	{%[tagSz], %[aad], %[aadSz], %[key], %[gcm_h], %[tmp], %[reg], %[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -5854,23 +5852,23 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
     word32 sz, const byte* nonce, word32 nonceSz, const byte* tag, word32 tagSz,
     const byte* aad, word32 aadSz, byte* key, byte* gcm_h, byte* tmp, byte* reg,
     int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register const byte* nonce asm ("r3") = (const byte*)nonce_p;
-    register word32 nonceSz asm ("r12") = (word32)nonceSz_p;
-    register const byte* tag asm ("lr") = (const byte*)tag_p;
-    register word32 tagSz asm ("r4") = (word32)tagSz_p;
-    register const byte* aad asm ("r5") = (const byte*)aad_p;
-    register word32 aadSz asm ("r6") = (word32)aadSz_p;
-    register byte* key asm ("r7") = (byte*)key_p;
-    register byte* gcm_h asm ("r8") = (byte*)gcm_h_p;
-    register byte* tmp asm ("r9") = (byte*)tmp_p;
-    register byte* reg asm ("r10") = (byte*)reg_p;
-    register int nr asm ("r11") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register const byte* nonce __asm__ ("r3") = (const byte*)nonce_p;
+    register word32 nonceSz __asm__ ("r12") = (word32)nonceSz_p;
+    register const byte* tag __asm__ ("lr") = (const byte*)tag_p;
+    register word32 tagSz __asm__ ("r4") = (word32)tagSz_p;
+    register const byte* aad __asm__ ("r5") = (const byte*)aad_p;
+    register word32 aadSz __asm__ ("r6") = (word32)aadSz_p;
+    register byte* key __asm__ ("r7") = (byte*)key_p;
+    register byte* gcm_h __asm__ ("r8") = (byte*)gcm_h_p;
+    register byte* tmp __asm__ ("r9") = (byte*)tmp_p;
+    register byte* reg __asm__ ("r10") = (byte*)reg_p;
+    register int nr __asm__ ("r11") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5925,7 +5923,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q11, q11, q0\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_h_done_%=:\n\t"
         /* aad */
         "ldr	r5, [sp, #12]\n\t"
         "lsr	r10, r6, #4\n\t"
@@ -5935,7 +5933,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_aad_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_start_4_%=:\n\t"
         "vldm	r5!, {q0-q2}\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vmov.i8	q5, #51\n\t"
@@ -6027,7 +6025,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_gcm_decrypt_arm32_crypto_aad_done_%=\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_aad_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_start_2_%=:\n\t"
         "vld1.32	{q14-q15}, [r5]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -6076,7 +6074,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #0\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_start_1_%=:\n\t"
         "vld1.32	{q14}, [r5]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -6105,7 +6103,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_done_%=:\n\t"
         "ands	r11, r6, #15\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_aad_partial_done_%=\n\t"
         "veor.8	q0, q0, q0\n\t"
@@ -6114,21 +6112,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_aad_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_start_dw_%=:\n\t"
         "ldr	r8, [r5], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_aad_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_aad_start_byte_%=\n\t"
         "ldrh	r8, [r5], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_aad_end_bytes_%=\n\t"
         "ldrb	r8, [r5], #1\n\t"
@@ -6136,7 +6134,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_aad_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "vld1.32	{q14}, [r9]\n\t"
         "vmov.i8	q12, #0x55\n\t"
@@ -6166,7 +6164,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_aad_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_aad_partial_done_%=:\n\t"
         /* in */
         "lsr	r10, %[sz], #4\n\t"
         "cmp	r10, #1\n\t"
@@ -6175,7 +6173,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_in_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_start_4_%=:\n\t"
         "vldm	%[in]!, {q0-q2}\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vmov.i8	q5, #51\n\t"
@@ -6267,7 +6265,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_gcm_decrypt_arm32_crypto_in_done_%=\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_in_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_start_2_%=:\n\t"
         "vld1.32	{q14-q15}, [%[in]]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -6316,7 +6314,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r10, #0\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_in_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_start_1_%=:\n\t"
         "vld1.32	{q14}, [%[in]]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -6345,7 +6343,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_in_partial_done_%=\n\t"
         "veor.8	q0, q0, q0\n\t"
@@ -6354,21 +6352,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_in_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_start_dw_%=:\n\t"
         "ldr	r8, [%[in]], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_in_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_in_start_byte_%=\n\t"
         "ldrh	r8, [%[in]], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_in_end_bytes_%=\n\t"
         "ldrb	r8, [%[in]], #1\n\t"
@@ -6376,7 +6374,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_in_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "vld1.32	{q14}, [r9]\n\t"
         "vmov.i8	q12, #0x55\n\t"
@@ -6406,10 +6404,10 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q7, q7, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_in_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_in_partial_done_%=:\n\t"
         "sub	%[in], %[in], %[sz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_done_gcm_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_done_gcm_%=:\n\t"
         /* nonceSz */
         "ldr	r12, [sp]\n\t"
         /* Load Nonce */
@@ -6425,12 +6423,12 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "mov	r5, #1\n\t"
         "b	L_aes_gcm_decrypt_arm32_crypto_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_ghash_nonce_%=:\n\t"
         "lsr	r10, r12, #4\n\t"
         "cmp	r10, #0\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_nonce_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_1_%=:\n\t"
         "vld1.32	{q14}, [%[nonce]]!\n\t"
         "vmov.i8	q12, #0x55\n\t"
         "vshl.u8	q0, q14, #1\n\t"
@@ -6461,7 +6459,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "subs	r10, r10, #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_nonce_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_done_%=:\n\t"
         "ands	r11, r12, #15\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_nonce_partial_done_%=\n\t"
         "veor.8	q0, q0, q0\n\t"
@@ -6470,21 +6468,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_nonce_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_dw_%=:\n\t"
         "ldr	r8, [%[nonce]], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_nonce_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_nonce_start_byte_%=\n\t"
         "ldrh	r8, [%[nonce]], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_nonce_end_bytes_%=\n\t"
         "ldrb	r8, [%[nonce]], #1\n\t"
@@ -6492,7 +6490,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_nonce_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "vld1.32	{q14}, [r9]\n\t"
         "vmov.i8	q12, #0x55\n\t"
@@ -6522,7 +6520,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q6, q6, q0\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_nonce_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_nonce_partial_done_%=:\n\t"
         "veor.8	q0, q0, q0\n\t"
         /* nonceSz */
         "ldr	r12, [sp]\n\t"
@@ -6563,7 +6561,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "vmov.32	s27, r5\n\t"
         "rev	r5, r5\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_done_nonce_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_done_nonce_%=:\n\t"
         /* reg */
         "ldr	r9, [sp, #32]\n\t"
         "vst1.32	{q7}, [r9]\n\t"
@@ -6584,7 +6582,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_done_%=\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_start_2_%=:\n\t"
         "add	r8, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "add	r5, r5, #2\n\t"
@@ -6653,7 +6651,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "bgt	L_aes_gcm_decrypt_arm32_crypto_192_start_2_%=\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_start_1_%=:\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "rev	r8, r5\n\t"
@@ -6689,7 +6687,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q14, q14, q4\n\t"
         "vst1.32	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_192_partial_done_%=\n\t"
         "veor.8	q14, q14, q14\n\t"
@@ -6698,21 +6696,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_start_dw_%=:\n\t"
         "ldr	lr, [%[in]], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [r9], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_192_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_start_byte_%=\n\t"
         "ldrh	lr, [%[in]], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_end_bytes_%=\n\t"
         "ldrb	lr, [%[in]], #1\n\t"
@@ -6720,7 +6718,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_192_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
@@ -6760,21 +6758,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_out_start_dw_%=:\n\t"
         "ldr	lr, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [%[out]], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_192_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_out_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_out_start_byte_%=\n\t"
         "ldrh	lr, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [%[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_out_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_192_out_end_bytes_%=\n\t"
         "ldrb	lr, [r9], #1\n\t"
@@ -6782,9 +6780,9 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [%[out]], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_192_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_192_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_192_partial_done_%=:\n\t"
         /* Finish */
         "add	r8, %[sz], #15\n\t"
         "sub	r8, r5, r8, lsr #4\n\t"
@@ -6821,13 +6819,13 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "b	L_aes_gcm_decrypt_arm32_crypto_done_enc_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	r10, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_done_%=\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_start_2_%=:\n\t"
         "add	r8, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "add	r5, r5, #2\n\t"
@@ -6906,7 +6904,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "bgt	L_aes_gcm_decrypt_arm32_crypto_256_start_2_%=\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_start_1_%=:\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "rev	r8, r5\n\t"
@@ -6948,7 +6946,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q14, q14, q4\n\t"
         "vst1.32	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_256_partial_done_%=\n\t"
         "veor.8	q14, q14, q14\n\t"
@@ -6957,21 +6955,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_start_dw_%=:\n\t"
         "ldr	lr, [%[in]], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [r9], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_256_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_start_byte_%=\n\t"
         "ldrh	lr, [%[in]], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_end_bytes_%=\n\t"
         "ldrb	lr, [%[in]], #1\n\t"
@@ -6979,7 +6977,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_256_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
@@ -7025,21 +7023,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_out_start_dw_%=:\n\t"
         "ldr	lr, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [%[out]], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_256_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_out_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_out_start_byte_%=\n\t"
         "ldrh	lr, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [%[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_out_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_256_out_end_bytes_%=\n\t"
         "ldrb	lr, [r9], #1\n\t"
@@ -7047,9 +7045,9 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [%[out]], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_256_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_256_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_256_partial_done_%=:\n\t"
         /* Finish */
         "add	r8, %[sz], #15\n\t"
         "sub	r8, r5, r8, lsr #4\n\t"
@@ -7092,13 +7090,13 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "b	L_aes_gcm_decrypt_arm32_crypto_done_enc_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	r10, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_done_%=\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_start_2_%=:\n\t"
         "add	r8, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "add	r5, r5, #2\n\t"
@@ -7156,7 +7154,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "bgt	L_aes_gcm_decrypt_arm32_crypto_128_start_2_%=\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_start_1_%=:\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
         "rev	r8, r5\n\t"
@@ -7185,7 +7183,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q14, q14, q4\n\t"
         "vst1.32	{q14}, [%[out]]!\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_done_%=:\n\t"
         "ands	r11, %[sz], #15\n\t"
         "beq	L_aes_gcm_decrypt_arm32_crypto_128_partial_done_%=\n\t"
         "veor.8	q14, q14, q14\n\t"
@@ -7194,21 +7192,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_start_dw_%=:\n\t"
         "ldr	lr, [%[in]], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [r9], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_128_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_start_byte_%=\n\t"
         "ldrh	lr, [%[in]], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_end_bytes_%=\n\t"
         "ldrb	lr, [%[in]], #1\n\t"
@@ -7216,7 +7214,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_128_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_end_bytes_%=:\n\t"
         "sub	r9, r9, r11\n\t"
         "add	r5, r5, #1\n\t"
         "vmov.8	q4, q6\n\t"
@@ -7249,21 +7247,21 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_out_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_out_start_dw_%=:\n\t"
         "ldr	lr, [r9], #4\n\t"
         "sub	r4, r4, #4\n\t"
         "str	lr, [%[out]], #4\n\t"
         "cmp	r4, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_128_out_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_out_start_sw_%=:\n\t"
         "cmp	r4, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_out_start_byte_%=\n\t"
         "ldrh	lr, [r9], #2\n\t"
         "sub	r4, r4, #2\n\t"
         "strh	lr, [%[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_out_start_byte_%=:\n\t"
         "cmp	r4, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_128_out_end_bytes_%=\n\t"
         "ldrb	lr, [r9], #1\n\t"
@@ -7271,9 +7269,9 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	lr, [%[out]], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_128_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_128_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_128_partial_done_%=:\n\t"
         /* Finish */
         "add	r8, %[sz], #15\n\t"
         "sub	r8, r5, r8, lsr #4\n\t"
@@ -7301,7 +7299,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "veor.8	q6, q6, q13\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_done_enc_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_done_enc_%=:\n\t"
         "vmov.i8	q13, #0x87\n\t"
         "vshr.u64	q13, q13, #56\n\t"
         /* gcm_h */
@@ -7359,28 +7357,28 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "vld1.8	{q0}, [lr]\n\t"
         "b	L_aes_gcm_decrypt_arm32_crypto_tag_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_part_tag_%=:\n\t"
         "veor.8	q0, q0, q0\n\t"
         "mov	r12, r4\n\t"
         "vst1.32	{q0}, [r9]\n\t"
         "cmp	r12, #4\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_sw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_dw_%=:\n\t"
         "ldr	r8, [lr], #4\n\t"
         "sub	r12, r12, #4\n\t"
         "str	r8, [r9], #4\n\t"
         "cmp	r12, #4\n\t"
         "bge	L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_dw_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_sw_%=:\n\t"
         "cmp	r12, #2\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_byte_%=\n\t"
         "ldrh	r8, [lr], #2\n\t"
         "sub	r12, r12, #2\n\t"
         "strh	r8, [r9], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_byte_%=:\n\t"
         "cmp	r12, #1\n\t"
         "blt	L_aes_gcm_decrypt_arm32_crypto_tag_tag_end_bytes_%=\n\t"
         "ldrb	r8, [lr], #1\n\t"
@@ -7388,7 +7386,7 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "strb	r8, [r9], #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_tag_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_end_bytes_%=:\n\t"
         "sub	r9, r9, r4\n\t"
         "vld1.32	{q0}, [r9]\n\t"
         "mov	r12, #16\n\t"
@@ -7397,14 +7395,14 @@ WC_OMIT_FRAME_POINTER int AES_GCM_decrypt_AARCH32(const byte* in, byte* out,
         "eor	r8, r8, r8\n\t"
         "add	r9, r9, r4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_calc_tag_byte_%=:\n\t"
         "strb	r8, [r9], #1\n\t"
         "subs	r12, r12, #1\n\t"
         "bne	L_aes_gcm_decrypt_arm32_crypto_tag_calc_tag_byte_%=\n\t"
         "subs	r9, r9, #16\n\t"
         "vld1.32	{q7}, [r9]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm32_crypto_tag_tag_loaded_%=:\n\t"
         "vceq.i32	q0, q0, q7\n\t"
         "vmov	r5, s0\n\t"
         "vmov	r8, s1\n\t"
@@ -7448,17 +7446,17 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
     word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register const byte* i asm ("r3") = (const byte*)i_p;
-    register byte* key asm ("r12") = (byte*)key_p;
-    register byte* key2 asm ("lr") = (byte*)key2_p;
-    register byte* tmp asm ("r4") = (byte*)tmp_p;
-    register int nr asm ("r5") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register const byte* i __asm__ ("r3") = (const byte*)i_p;
+    register byte* key __asm__ ("r12") = (byte*)key_p;
+    register byte* key2 __asm__ ("lr") = (byte*)key2_p;
+    register byte* tmp __asm__ ("r4") = (byte*)tmp_p;
+    register int nr __asm__ ("r5") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -7509,7 +7507,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #1\n\t"
         "blt	L_aes_xts_encrypt_arm32_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_192_start_1_%=:\n\t"
         "vld1.8	{q1}, [%[in]]!\n\t"
         "veor.32	q1, q1, q0\n\t"
         "aese.8	q1, q2\n\t"
@@ -7551,7 +7549,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q1}, [%[out]]!\n\t"
         "bne	L_aes_xts_encrypt_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_192_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_xts_encrypt_arm32_crypto_192_partial_done_%=\n\t"
         "sub	%[out], %[out], #16\n\t"
@@ -7560,7 +7558,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.32	{q1}, [r4]\n\t"
         "mov	r5, %[sz]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_192_start_byte_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_192_start_byte_%=:\n\t"
         "ldrb	r8, [r4]\n\t"
         "ldrb	r9, [%[in]], #1\n\t"
         "strb	r8, [%[out]], #1\n\t"
@@ -7599,12 +7597,12 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q1, q1, q0\n\t"
         "vst1.8	{q1}, [%[out]]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_192_partial_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_192_partial_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_xts_encrypt_arm32_crypto_done_%=\n\t"
         /* AES_XTS_256 */
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vldm.32	lr!, {q10-q13}\n\t"
         "aese.8	q0, q2\n\t"
@@ -7646,7 +7644,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #1\n\t"
         "blt	L_aes_xts_encrypt_arm32_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_256_start_1_%=:\n\t"
         "vld1.8	{q1}, [%[in]]!\n\t"
         "veor.32	q1, q1, q0\n\t"
         "aese.8	q1, q2\n\t"
@@ -7696,7 +7694,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q1}, [%[out]]!\n\t"
         "bne	L_aes_xts_encrypt_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_256_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_xts_encrypt_arm32_crypto_256_partial_done_%=\n\t"
         "sub	%[out], %[out], #16\n\t"
@@ -7705,7 +7703,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.32	{q1}, [r4]\n\t"
         "mov	r5, %[sz]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_256_start_byte_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_256_start_byte_%=:\n\t"
         "ldrb	r8, [r4]\n\t"
         "ldrb	r9, [%[in]], #1\n\t"
         "strb	r8, [%[out]], #1\n\t"
@@ -7751,12 +7749,12 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q1, q1, q0\n\t"
         "vst1.8	{q1}, [%[out]]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_256_partial_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_256_partial_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_xts_encrypt_arm32_crypto_done_%=\n\t"
         /* AES_XTS_128 */
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vldm.32	lr!, {q10-q12}\n\t"
         "aese.8	q0, q2\n\t"
@@ -7787,7 +7785,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #1\n\t"
         "blt	L_aes_xts_encrypt_arm32_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_128_start_1_%=:\n\t"
         "vld1.8	{q1}, [%[in]]!\n\t"
         "veor.32	q1, q1, q0\n\t"
         "aese.8	q1, q2\n\t"
@@ -7825,7 +7823,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q1}, [%[out]]!\n\t"
         "bne	L_aes_xts_encrypt_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_128_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_xts_encrypt_arm32_crypto_128_partial_done_%=\n\t"
         "sub	%[out], %[out], #16\n\t"
@@ -7834,7 +7832,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "vst1.32	{q1}, [r4]\n\t"
         "mov	r5, %[sz]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_128_start_byte_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_128_start_byte_%=:\n\t"
         "ldrb	r8, [r4]\n\t"
         "ldrb	r9, [%[in]], #1\n\t"
         "strb	r8, [%[out]], #1\n\t"
@@ -7869,10 +7867,10 @@ WC_OMIT_FRAME_POINTER void AES_XTS_encrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q1, q1, q0\n\t"
         "vst1.8	{q1}, [%[out]]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_128_partial_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_128_partial_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_xts_encrypt_arm32_crypto_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm32_crypto_done_%=:\n\t"
         "pop	{%[key], %[key2]}\n\t"
         "pop	{%[tmp], %[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -7899,17 +7897,17 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
     word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const byte* in asm ("r0") = (const byte*)in_p;
-    register byte* out asm ("r1") = (byte*)out_p;
-    register word32 sz asm ("r2") = (word32)sz_p;
-    register const byte* i asm ("r3") = (const byte*)i_p;
-    register byte* key asm ("r12") = (byte*)key_p;
-    register byte* key2 asm ("lr") = (byte*)key2_p;
-    register byte* tmp asm ("r4") = (byte*)tmp_p;
-    register int nr asm ("r5") = (int)nr_p;
+    register const byte* in __asm__ ("r0") = (const byte*)in_p;
+    register byte* out __asm__ ("r1") = (byte*)out_p;
+    register word32 sz __asm__ ("r2") = (word32)sz_p;
+    register const byte* i __asm__ ("r3") = (const byte*)i_p;
+    register byte* key __asm__ ("r12") = (byte*)key_p;
+    register byte* key2 __asm__ ("lr") = (byte*)key2_p;
+    register byte* tmp __asm__ ("r4") = (byte*)tmp_p;
+    register int nr __asm__ ("r5") = (int)nr_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -7963,7 +7961,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #1\n\t"
         "blt	L_aes_xts_decrypt_arm32_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_192_start_1_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_192_start_1_%=:\n\t"
         "vld1.8	{q0}, [%[in]]!\n\t"
         "veor.32	q0, q0, q1\n\t"
         "aesd.8	q0, q2\n\t"
@@ -8005,7 +8003,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q0}, [%[out]]!\n\t"
         "bne	L_aes_xts_decrypt_arm32_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_192_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_192_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_xts_decrypt_arm32_crypto_192_partial_done_%=\n\t"
         "and	r5, lr, r9, asr #31\n\t"
@@ -8050,7 +8048,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "add	%[out], %[out], #16\n\t"
         "mov	r5, %[sz]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_192_start_byte_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_192_start_byte_%=:\n\t"
         "ldrb	r8, [r4]\n\t"
         "ldrb	r9, [%[in]], #1\n\t"
         "strb	r8, [%[out]], #1\n\t"
@@ -8089,12 +8087,12 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q0, q0, q1\n\t"
         "vst1.8	{q0}, [%[out]]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_192_partial_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_192_partial_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_xts_decrypt_arm32_crypto_done_%=\n\t"
         /* AES_XTS_256 */
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_start_256_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "vldm.32	lr!, {q10-q13}\n\t"
         "aese.8	q1, q2\n\t"
@@ -8136,7 +8134,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #1\n\t"
         "blt	L_aes_xts_decrypt_arm32_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_256_start_1_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_256_start_1_%=:\n\t"
         "vld1.8	{q0}, [%[in]]!\n\t"
         "veor.32	q0, q0, q1\n\t"
         "aesd.8	q0, q2\n\t"
@@ -8187,7 +8185,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q0}, [%[out]]!\n\t"
         "bne	L_aes_xts_decrypt_arm32_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_256_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_256_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_xts_decrypt_arm32_crypto_256_partial_done_%=\n\t"
         "and	r5, lr, r9, asr #31\n\t"
@@ -8241,7 +8239,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "add	%[out], %[out], #16\n\t"
         "mov	r5, %[sz]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_256_start_byte_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_256_start_byte_%=:\n\t"
         "ldrb	r8, [r4]\n\t"
         "ldrb	r9, [%[in]], #1\n\t"
         "strb	r8, [%[out]], #1\n\t"
@@ -8288,12 +8286,12 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q0, q0, q1\n\t"
         "vst1.8	{q0}, [%[out]]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_256_partial_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_256_partial_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_xts_decrypt_arm32_crypto_done_%=\n\t"
         /* AES_XTS_128 */
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_start_128_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "vldm.32	lr!, {q10-q12}\n\t"
         "aese.8	q1, q2\n\t"
@@ -8324,7 +8322,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "cmp	r4, #1\n\t"
         "blt	L_aes_xts_decrypt_arm32_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_128_start_1_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_128_start_1_%=:\n\t"
         "vld1.8	{q0}, [%[in]]!\n\t"
         "veor.32	q0, q0, q1\n\t"
         "aesd.8	q0, q2\n\t"
@@ -8362,7 +8360,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "vst1.8	{q0}, [%[out]]!\n\t"
         "bne	L_aes_xts_decrypt_arm32_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_128_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_128_done_%=:\n\t"
         "cmp	%[sz], #0\n\t"
         "beq	L_aes_xts_decrypt_arm32_crypto_128_partial_done_%=\n\t"
         "and	r5, lr, r9, asr #31\n\t"
@@ -8403,7 +8401,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "add	%[out], %[out], #16\n\t"
         "mov	r5, %[sz]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_128_start_byte_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_128_start_byte_%=:\n\t"
         "ldrb	r8, [r4]\n\t"
         "ldrb	r9, [%[in]], #1\n\t"
         "strb	r8, [%[out]], #1\n\t"
@@ -8438,10 +8436,10 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
         "veor.32	q0, q0, q1\n\t"
         "vst1.8	{q0}, [%[out]]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_128_partial_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_128_partial_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_xts_decrypt_arm32_crypto_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm32_crypto_done_%=:\n\t"
         "pop	{%[key], %[key2]}\n\t"
         "pop	{%[tmp], %[nr]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -8464,7 +8462,7 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out,
 #endif /* WOLFSSL_AES_XTS */
 #else
 #ifdef HAVE_AES_DECRYPT
-static const word32 L_AES_ARM32_td_data[] = {
+XALIGNED(8) static const word32 L_AES_ARM32_td_data[] = {
     0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e,
     0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303,
     0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c,
@@ -8535,7 +8533,7 @@ static const word32 L_AES_ARM32_td_data[] = {
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
-static const word32 L_AES_ARM32_te_data[] = {
+XALIGNED(8) static const word32 L_AES_ARM32_te_data[] = {
     0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
     0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
     0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b,
@@ -8619,13 +8617,13 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p);
 WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
 #else
 WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks, word32 rounds)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register unsigned char* ks asm ("r0") = (unsigned char*)ks_p;
-    register word32 rounds asm ("r1") = (word32)rounds_p;
-    register word32* L_AES_ARM32_te_c asm ("r2") = (word32*)L_AES_ARM32_te;
-    register word32* L_AES_ARM32_td_c asm ("r3") = (word32*)L_AES_ARM32_td;
+    register unsigned char* ks __asm__ ("r0") = (unsigned char*)ks_p;
+    register word32 rounds __asm__ ("r1") = (word32)rounds_p;
+    register word32* L_AES_ARM32_te_c __asm__ ("r2") = (word32*)L_AES_ARM32_te;
+    register word32* L_AES_ARM32_td_c __asm__ ("r3") = (word32*)L_AES_ARM32_td;
 #else
     register word32* L_AES_ARM32_te_c = (word32*)L_AES_ARM32_te;
     register word32* L_AES_ARM32_td_c = (word32*)L_AES_ARM32_td;
@@ -8637,7 +8635,7 @@ WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks, word32 rounds)
         "add	r10, %[ks], %[rounds], lsl #4\n\t"
         "mov	r11, %[rounds]\n\t"
         "\n"
-    "L_AES_invert_key_loop_%=: \n\t"
+    "L_AES_invert_key_loop_%=:\n\t"
         "ldm	%[ks], {r2, r3, r4, r5}\n\t"
         "ldm	r10, {r6, r7, r8, r9}\n\t"
         "stm	r10, {r2, r3, r4, r5}\n\t"
@@ -8649,7 +8647,7 @@ WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks, word32 rounds)
         "add	%[ks], %[ks], #16\n\t"
         "sub	r11, %[rounds], #1\n\t"
         "\n"
-    "L_AES_invert_key_mix_loop_%=: \n\t"
+    "L_AES_invert_key_mix_loop_%=:\n\t"
         "ldm	%[ks], {r2, r3, r4, r5}\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
@@ -8842,7 +8840,7 @@ WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks, word32 rounds)
 }
 
 #endif /* HAVE_AES_DECRYPT */
-static const word32 L_AES_ARM32_rcon[] = {
+XALIGNED(8) static const word32 L_AES_ARM32_rcon[] = {
     0x01000000, 0x02000000, 0x04000000, 0x08000000,
     0x10000000, 0x20000000, 0x40000000, 0x80000000,
     0x1b000000, 0x36000000
@@ -8856,14 +8854,15 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
     word32 len, unsigned char* ks)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* key asm ("r0") = (const unsigned char*)key_p;
-    register word32 len asm ("r1") = (word32)len_p;
-    register unsigned char* ks asm ("r2") = (unsigned char*)ks_p;
-    register word32* L_AES_ARM32_te_c asm ("r3") = (word32*)L_AES_ARM32_te;
-    register word32* L_AES_ARM32_rcon_c asm ("r12") =
+    register const unsigned char* key __asm__ ("r0") =
+        (const unsigned char*)key_p;
+    register word32 len __asm__ ("r1") = (word32)len_p;
+    register unsigned char* ks __asm__ ("r2") = (unsigned char*)ks_p;
+    register word32* L_AES_ARM32_te_c __asm__ ("r3") = (word32*)L_AES_ARM32_te;
+    register word32* L_AES_ARM32_rcon_c __asm__ ("r12") =
         (word32*)&L_AES_ARM32_rcon;
 #else
     register word32* L_AES_ARM32_te_c = (word32*)L_AES_ARM32_te;
@@ -8945,7 +8944,7 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
         "sub	%[ks], %[ks], #16\n\t"
         "mov	r12, #6\n\t"
         "\n"
-    "L_AES_set_encrypt_key_loop_256_%=: \n\t"
+    "L_AES_set_encrypt_key_loop_256_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r4, r7, #24\n\t"
@@ -9093,7 +9092,7 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
         "sub	%[ks], %[ks], #16\n\t"
         "b	L_AES_set_encrypt_key_end_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_start_192_%=: \n\t"
+    "L_AES_set_encrypt_key_start_192_%=:\n\t"
         "ldr	r4, [%[key]]\n\t"
         "ldr	r5, [%[key], #4]\n\t"
         "ldr	r6, [%[key], #8]\n\t"
@@ -9149,7 +9148,7 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
         "mov	r7, %[len]\n\t"
         "mov	r12, #7\n\t"
         "\n"
-    "L_AES_set_encrypt_key_loop_192_%=: \n\t"
+    "L_AES_set_encrypt_key_loop_192_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r0, r7, #24\n\t"
@@ -9248,7 +9247,7 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
         "stm	%[ks], {r0, r1, r4, r5}\n\t"
         "b	L_AES_set_encrypt_key_end_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_start_128_%=: \n\t"
+    "L_AES_set_encrypt_key_start_128_%=:\n\t"
         "ldr	r4, [%[key]]\n\t"
         "ldr	r5, [%[key], #4]\n\t"
         "ldr	r6, [%[key], #8]\n\t"
@@ -9283,7 +9282,7 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
         "stm	%[ks], {r4, r5, r6, r7}\n\t"
         "mov	r12, #10\n\t"
         "\n"
-    "L_AES_set_encrypt_key_loop_128_%=: \n\t"
+    "L_AES_set_encrypt_key_loop_128_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r4, r7, #24\n\t"
@@ -9333,7 +9332,7 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
         "subs	r12, r12, #1\n\t"
         "bne	L_AES_set_encrypt_key_loop_128_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_end_%=: \n\t"
+    "L_AES_set_encrypt_key_end_%=:\n\t"
         "pop	{%[L_AES_ARM32_rcon]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks),
@@ -9359,18 +9358,18 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_block(const word32* te_p, int nr_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_encrypt_block(const word32* te, int nr, int len,
     const word32* ks)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const word32* te asm ("r0") = (const word32*)te_p;
-    register int nr asm ("r1") = (int)nr_p;
-    register int len asm ("r2") = (int)len_p;
-    register const word32* ks asm ("r3") = (const word32*)ks_p;
+    register const word32* te __asm__ ("r0") = (const word32*)te_p;
+    register int nr __asm__ ("r1") = (int)nr_p;
+    register int len __asm__ ("r2") = (int)len_p;
+    register const word32* ks __asm__ ("r3") = (const word32*)ks_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "\n"
-    "L_AES_encrypt_block_nr_%=: \n\t"
+    "L_AES_encrypt_block_nr_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -10030,15 +10029,17 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
     unsigned char* out, unsigned long len, const unsigned char* ks, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
-    register unsigned char* out asm ("r1") = (unsigned char*)out_p;
-    register unsigned long len asm ("r2") = (unsigned long)len_p;
-    register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
-    register int nr asm ("r12") = (int)nr_p;
-    register word32* L_AES_ARM32_te_ecb_c asm ("lr") =
+    register const unsigned char* in __asm__ ("r0") =
+        (const unsigned char*)in_p;
+    register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p;
+    register unsigned long len __asm__ ("r2") = (unsigned long)len_p;
+    register const unsigned char* ks __asm__ ("r3") =
+        (const unsigned char*)ks_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
+    register word32* L_AES_ARM32_te_ecb_c __asm__ ("lr") =
         (word32*)L_AES_ARM32_te_ecb;
 #else
     register word32* L_AES_ARM32_te_ecb_c = (word32*)L_AES_ARM32_te_ecb;
@@ -10055,7 +10056,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "cmp	r12, #12\n\t"
         "beq	L_AES_ECB_encrypt_start_block_192_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_loop_block_256_%=: \n\t"
+    "L_AES_ECB_encrypt_loop_block_256_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
         "ldr	r6, [lr, #8]\n\t"
@@ -10095,7 +10096,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_ECB_encrypt_block_nr_256_%=: \n\t"
+    "L_AES_ECB_encrypt_block_nr_256_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -10766,9 +10767,9 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "bne	L_AES_ECB_encrypt_loop_block_256_%=\n\t"
         "b	L_AES_ECB_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_start_block_192_%=: \n\t"
+    "L_AES_ECB_encrypt_start_block_192_%=:\n\t"
         "\n"
-    "L_AES_ECB_encrypt_loop_block_192_%=: \n\t"
+    "L_AES_ECB_encrypt_loop_block_192_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
         "ldr	r6, [lr, #8]\n\t"
@@ -10808,7 +10809,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_ECB_encrypt_block_nr_192_%=: \n\t"
+    "L_AES_ECB_encrypt_block_nr_192_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -11479,9 +11480,9 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "bne	L_AES_ECB_encrypt_loop_block_192_%=\n\t"
         "b	L_AES_ECB_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_start_block_128_%=: \n\t"
+    "L_AES_ECB_encrypt_start_block_128_%=:\n\t"
         "\n"
-    "L_AES_ECB_encrypt_loop_block_128_%=: \n\t"
+    "L_AES_ECB_encrypt_loop_block_128_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
         "ldr	r6, [lr, #8]\n\t"
@@ -11521,7 +11522,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_ECB_encrypt_block_nr_128_%=: \n\t"
+    "L_AES_ECB_encrypt_block_nr_128_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -12191,7 +12192,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
         "add	%[out], %[out], #16\n\t"
         "bne	L_AES_ECB_encrypt_loop_block_128_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_end_%=: \n\t"
+    "L_AES_ECB_encrypt_end_%=:\n\t"
         "pop	{%[ks]}\n\t"
         "pop	{%[nr], %[L_AES_ARM32_te_ecb]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -12222,16 +12223,18 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in_p,
 WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
     unsigned char* out, unsigned long len, const unsigned char* ks, int nr,
     unsigned char* iv)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
-    register unsigned char* out asm ("r1") = (unsigned char*)out_p;
-    register unsigned long len asm ("r2") = (unsigned long)len_p;
-    register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
-    register int nr asm ("r12") = (int)nr_p;
-    register unsigned char* iv asm ("lr") = (unsigned char*)iv_p;
-    register word32* L_AES_ARM32_te_cbc_c asm ("r4") =
+    register const unsigned char* in __asm__ ("r0") =
+        (const unsigned char*)in_p;
+    register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p;
+    register unsigned long len __asm__ ("r2") = (unsigned long)len_p;
+    register const unsigned char* ks __asm__ ("r3") =
+        (const unsigned char*)ks_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
+    register unsigned char* iv __asm__ ("lr") = (unsigned char*)iv_p;
+    register word32* L_AES_ARM32_te_cbc_c __asm__ ("r4") =
         (word32*)L_AES_ARM32_te_cbc;
 #else
     register word32* L_AES_ARM32_te_cbc_c = (word32*)L_AES_ARM32_te_cbc;
@@ -12251,7 +12254,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "cmp	r8, #12\n\t"
         "beq	L_AES_CBC_encrypt_start_block_192_%=\n\t"
         "\n"
-    "L_AES_CBC_encrypt_loop_block_256_%=: \n\t"
+    "L_AES_CBC_encrypt_loop_block_256_%=:\n\t"
         "ldr	r8, [lr]\n\t"
         "ldr	r9, [lr, #4]\n\t"
         "ldr	r10, [lr, #8]\n\t"
@@ -12295,7 +12298,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_encrypt_block_nr_256_%=: \n\t"
+    "L_AES_CBC_encrypt_block_nr_256_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -12966,9 +12969,9 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "bne	L_AES_CBC_encrypt_loop_block_256_%=\n\t"
         "b	L_AES_CBC_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_CBC_encrypt_start_block_192_%=: \n\t"
+    "L_AES_CBC_encrypt_start_block_192_%=:\n\t"
         "\n"
-    "L_AES_CBC_encrypt_loop_block_192_%=: \n\t"
+    "L_AES_CBC_encrypt_loop_block_192_%=:\n\t"
         "ldr	r8, [lr]\n\t"
         "ldr	r9, [lr, #4]\n\t"
         "ldr	r10, [lr, #8]\n\t"
@@ -13012,7 +13015,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_encrypt_block_nr_192_%=: \n\t"
+    "L_AES_CBC_encrypt_block_nr_192_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -13683,9 +13686,9 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "bne	L_AES_CBC_encrypt_loop_block_192_%=\n\t"
         "b	L_AES_CBC_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_CBC_encrypt_start_block_128_%=: \n\t"
+    "L_AES_CBC_encrypt_start_block_128_%=:\n\t"
         "\n"
-    "L_AES_CBC_encrypt_loop_block_128_%=: \n\t"
+    "L_AES_CBC_encrypt_loop_block_128_%=:\n\t"
         "ldr	r8, [lr]\n\t"
         "ldr	r9, [lr, #4]\n\t"
         "ldr	r10, [lr, #8]\n\t"
@@ -13729,7 +13732,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_encrypt_block_nr_128_%=: \n\t"
+    "L_AES_CBC_encrypt_block_nr_128_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -14399,7 +14402,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
         "add	%[out], %[out], #16\n\t"
         "bne	L_AES_CBC_encrypt_loop_block_128_%=\n\t"
         "\n"
-    "L_AES_CBC_encrypt_end_%=: \n\t"
+    "L_AES_CBC_encrypt_end_%=:\n\t"
         "pop	{%[ks], r9}\n\t"
         "stm	r9, {r4, r5, r6, r7}\n\t"
         "pop	{%[nr], %[iv]}\n\t"
@@ -14433,16 +14436,18 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in_p,
 WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
     unsigned char* out, unsigned long len, const unsigned char* ks, int nr,
     unsigned char* ctr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
-    register unsigned char* out asm ("r1") = (unsigned char*)out_p;
-    register unsigned long len asm ("r2") = (unsigned long)len_p;
-    register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
-    register int nr asm ("r12") = (int)nr_p;
-    register unsigned char* ctr asm ("lr") = (unsigned char*)ctr_p;
-    register word32* L_AES_ARM32_te_ctr_c asm ("r4") =
+    register const unsigned char* in __asm__ ("r0") =
+        (const unsigned char*)in_p;
+    register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p;
+    register unsigned long len __asm__ ("r2") = (unsigned long)len_p;
+    register const unsigned char* ks __asm__ ("r3") =
+        (const unsigned char*)ks_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
+    register unsigned char* ctr __asm__ ("lr") = (unsigned char*)ctr_p;
+    register word32* L_AES_ARM32_te_ctr_c __asm__ ("r4") =
         (word32*)L_AES_ARM32_te_ctr;
 #else
     register word32* L_AES_ARM32_te_ctr_c = (word32*)L_AES_ARM32_te_ctr;
@@ -14486,7 +14491,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "cmp	r12, #12\n\t"
         "beq	L_AES_CTR_encrypt_start_block_192_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_loop_block_256_%=: \n\t"
+    "L_AES_CTR_encrypt_loop_block_256_%=:\n\t"
         "push	{r1, %[len], lr}\n\t"
         "ldr	lr, [sp, #16]\n\t"
         "adds	r11, r7, #1\n\t"
@@ -14505,7 +14510,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CTR_encrypt_block_nr_256_%=: \n\t"
+    "L_AES_CTR_encrypt_block_nr_256_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -15186,9 +15191,9 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "bne	L_AES_CTR_encrypt_loop_block_256_%=\n\t"
         "b	L_AES_CTR_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_start_block_192_%=: \n\t"
+    "L_AES_CTR_encrypt_start_block_192_%=:\n\t"
         "\n"
-    "L_AES_CTR_encrypt_loop_block_192_%=: \n\t"
+    "L_AES_CTR_encrypt_loop_block_192_%=:\n\t"
         "push	{r1, %[len], lr}\n\t"
         "ldr	lr, [sp, #16]\n\t"
         "adds	r11, r7, #1\n\t"
@@ -15207,7 +15212,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CTR_encrypt_block_nr_192_%=: \n\t"
+    "L_AES_CTR_encrypt_block_nr_192_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -15888,9 +15893,9 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "bne	L_AES_CTR_encrypt_loop_block_192_%=\n\t"
         "b	L_AES_CTR_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_start_block_128_%=: \n\t"
+    "L_AES_CTR_encrypt_start_block_128_%=:\n\t"
         "\n"
-    "L_AES_CTR_encrypt_loop_block_128_%=: \n\t"
+    "L_AES_CTR_encrypt_loop_block_128_%=:\n\t"
         "push	{r1, %[len], lr}\n\t"
         "ldr	lr, [sp, #16]\n\t"
         "adds	r11, r7, #1\n\t"
@@ -15909,7 +15914,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CTR_encrypt_block_nr_128_%=: \n\t"
+    "L_AES_CTR_encrypt_block_nr_128_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -16589,7 +16594,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "add	%[out], %[out], #16\n\t"
         "bne	L_AES_CTR_encrypt_loop_block_128_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_end_%=: \n\t"
+    "L_AES_CTR_encrypt_end_%=:\n\t"
         "pop	{%[ks], r8}\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "eor	r10, r4, r4, ror #16\n\t"
@@ -16644,17 +16649,17 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_block(const word32* td_p, int nr_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_decrypt_block(const word32* td, int nr,
     const byte* td4)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const word32* td asm ("r0") = (const word32*)td_p;
-    register int nr asm ("r1") = (int)nr_p;
-    register const byte* td4 asm ("r2") = (const byte*)td4_p;
+    register const word32* td __asm__ ("r0") = (const word32*)td_p;
+    register int nr __asm__ ("r1") = (int)nr_p;
+    register const byte* td4 __asm__ ("r2") = (const byte*)td4_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "\n"
-    "L_AES_decrypt_block_nr_%=: \n\t"
+    "L_AES_decrypt_block_nr_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -17304,7 +17309,7 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_block(const word32* td, int nr,
 static const word32* L_AES_ARM32_td_ecb = L_AES_ARM32_td_data;
 #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
         defined(HAVE_AES_ECB)
-static const byte L_AES_ARM32_ecb_td4[] = {
+XALIGNED(4) static const word8 L_AES_ARM32_ecb_td4[] = {
     0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
     0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
     0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
@@ -17348,21 +17353,23 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in_p,
 #else
 WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
     unsigned char* out, unsigned long len, const unsigned char* ks, int nr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
-    register unsigned char* out asm ("r1") = (unsigned char*)out_p;
-    register unsigned long len asm ("r2") = (unsigned long)len_p;
-    register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
-    register int nr asm ("r12") = (int)nr_p;
-    register word32* L_AES_ARM32_td_ecb_c asm ("lr") =
+    register const unsigned char* in __asm__ ("r0") =
+        (const unsigned char*)in_p;
+    register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p;
+    register unsigned long len __asm__ ("r2") = (unsigned long)len_p;
+    register const unsigned char* ks __asm__ ("r3") =
+        (const unsigned char*)ks_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
+    register word32* L_AES_ARM32_td_ecb_c __asm__ ("lr") =
         (word32*)L_AES_ARM32_td_ecb;
-    register byte* L_AES_ARM32_ecb_td4_c asm ("r4") =
-        (byte*)&L_AES_ARM32_ecb_td4;
+    register word8* L_AES_ARM32_ecb_td4_c __asm__ ("r4") =
+        (word8*)&L_AES_ARM32_ecb_td4;
 #else
     register word32* L_AES_ARM32_td_ecb_c = (word32*)L_AES_ARM32_td_ecb;
-    register byte* L_AES_ARM32_ecb_td4_c = (byte*)&L_AES_ARM32_ecb_td4;
+    register word8* L_AES_ARM32_ecb_td4_c = (word8*)&L_AES_ARM32_ecb_td4;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -17378,7 +17385,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "cmp	r8, #12\n\t"
         "beq	L_AES_ECB_decrypt_start_block_192_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_loop_block_256_%=: \n\t"
+    "L_AES_ECB_decrypt_loop_block_256_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
         "ldr	r6, [lr, #8]\n\t"
@@ -17418,7 +17425,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_ECB_decrypt_block_nr_256_%=: \n\t"
+    "L_AES_ECB_decrypt_block_nr_256_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -18088,9 +18095,9 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "bne	L_AES_ECB_decrypt_loop_block_256_%=\n\t"
         "b	L_AES_ECB_decrypt_end_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_start_block_192_%=: \n\t"
+    "L_AES_ECB_decrypt_start_block_192_%=:\n\t"
         "\n"
-    "L_AES_ECB_decrypt_loop_block_192_%=: \n\t"
+    "L_AES_ECB_decrypt_loop_block_192_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
         "ldr	r6, [lr, #8]\n\t"
@@ -18130,7 +18137,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_ECB_decrypt_block_nr_192_%=: \n\t"
+    "L_AES_ECB_decrypt_block_nr_192_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -18800,9 +18807,9 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "bne	L_AES_ECB_decrypt_loop_block_192_%=\n\t"
         "b	L_AES_ECB_decrypt_end_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_start_block_128_%=: \n\t"
+    "L_AES_ECB_decrypt_start_block_128_%=:\n\t"
         "\n"
-    "L_AES_ECB_decrypt_loop_block_128_%=: \n\t"
+    "L_AES_ECB_decrypt_loop_block_128_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
         "ldr	r6, [lr, #8]\n\t"
@@ -18842,7 +18849,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_ECB_decrypt_block_nr_128_%=: \n\t"
+    "L_AES_ECB_decrypt_block_nr_128_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -19511,7 +19518,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
         "add	%[out], %[out], #16\n\t"
         "bne	L_AES_ECB_decrypt_loop_block_128_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_end_%=: \n\t"
+    "L_AES_ECB_decrypt_end_%=:\n\t"
         "pop	{%[nr], %[L_AES_ARM32_td_ecb]}\n\t"
         "pop	{%[L_AES_ARM32_ecb_td4]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -19531,7 +19538,7 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
 
 #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || defined(HAVE_AES_ECB) */
 #ifdef HAVE_AES_CBC
-static const byte L_AES_ARM32_cbc_td4[] = {
+XALIGNED(4) static const word8 L_AES_ARM32_cbc_td4[] = {
     0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
     0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
     0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
@@ -19577,22 +19584,24 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in_p,
 WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
     unsigned char* out, unsigned long len, const unsigned char* ks, int nr,
     unsigned char* iv)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
-    register unsigned char* out asm ("r1") = (unsigned char*)out_p;
-    register unsigned long len asm ("r2") = (unsigned long)len_p;
-    register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
-    register int nr asm ("r12") = (int)nr_p;
-    register unsigned char* iv asm ("lr") = (unsigned char*)iv_p;
-    register word32* L_AES_ARM32_td_ecb_c asm ("r4") =
+    register const unsigned char* in __asm__ ("r0") =
+        (const unsigned char*)in_p;
+    register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p;
+    register unsigned long len __asm__ ("r2") = (unsigned long)len_p;
+    register const unsigned char* ks __asm__ ("r3") =
+        (const unsigned char*)ks_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
+    register unsigned char* iv __asm__ ("lr") = (unsigned char*)iv_p;
+    register word32* L_AES_ARM32_td_ecb_c __asm__ ("r4") =
         (word32*)L_AES_ARM32_td_ecb;
-    register byte* L_AES_ARM32_cbc_td4_c asm ("r5") =
-        (byte*)&L_AES_ARM32_cbc_td4;
+    register word8* L_AES_ARM32_cbc_td4_c __asm__ ("r5") =
+        (word8*)&L_AES_ARM32_cbc_td4;
 #else
     register word32* L_AES_ARM32_td_ecb_c = (word32*)L_AES_ARM32_td_ecb;
-    register byte* L_AES_ARM32_cbc_td4_c = (byte*)&L_AES_ARM32_cbc_td4;
+    register word8* L_AES_ARM32_cbc_td4_c = (word8*)&L_AES_ARM32_cbc_td4;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -19610,7 +19619,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "cmp	r8, #12\n\t"
         "beq	L_AES_CBC_decrypt_loop_block_192_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_loop_block_256_%=: \n\t"
+    "L_AES_CBC_decrypt_loop_block_256_%=:\n\t"
         "push	{r1, r12, lr}\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
@@ -19663,7 +19672,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_decrypt_block_nr_256_odd_%=: \n\t"
+    "L_AES_CBC_decrypt_block_nr_256_odd_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -20389,7 +20398,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_decrypt_block_nr_256_even_%=: \n\t"
+    "L_AES_CBC_decrypt_block_nr_256_even_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -21077,7 +21086,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bne	L_AES_CBC_decrypt_loop_block_256_%=\n\t"
         "b	L_AES_CBC_decrypt_end_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_loop_block_192_%=: \n\t"
+    "L_AES_CBC_decrypt_loop_block_192_%=:\n\t"
         "push	{r1, r12, lr}\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
@@ -21130,7 +21139,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_decrypt_block_nr_192_odd_%=: \n\t"
+    "L_AES_CBC_decrypt_block_nr_192_odd_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -21856,7 +21865,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_decrypt_block_nr_192_even_%=: \n\t"
+    "L_AES_CBC_decrypt_block_nr_192_even_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -22544,7 +22553,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bne	L_AES_CBC_decrypt_loop_block_192_%=\n\t"
         "b	L_AES_CBC_decrypt_end_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_loop_block_128_%=: \n\t"
+    "L_AES_CBC_decrypt_loop_block_128_%=:\n\t"
         "push	{r1, r12, lr}\n\t"
         "ldr	r4, [lr]\n\t"
         "ldr	r5, [lr, #4]\n\t"
@@ -22597,7 +22606,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_decrypt_block_nr_128_odd_%=: \n\t"
+    "L_AES_CBC_decrypt_block_nr_128_odd_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -23323,7 +23332,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bl	AES_decrypt_block\n\t"
 #else
         "\n"
-    "L_AES_CBC_decrypt_block_nr_128_even_%=: \n\t"
+    "L_AES_CBC_decrypt_block_nr_128_even_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r7, #8\n\t"
@@ -24011,7 +24020,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "bne	L_AES_CBC_decrypt_loop_block_128_%=\n\t"
         "b	L_AES_CBC_decrypt_end_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_end_odd_%=: \n\t"
+    "L_AES_CBC_decrypt_end_odd_%=:\n\t"
         "ldr	r4, [sp, #4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "ldr	r8, [r4, #16]\n\t"
@@ -24037,7 +24046,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         "strd	r10, r11, [r4, #8]\n\t"
 #endif
         "\n"
-    "L_AES_CBC_decrypt_end_%=: \n\t"
+    "L_AES_CBC_decrypt_end_%=:\n\t"
         "pop	{%[ks]-r4}\n\t"
         "pop	{%[nr], %[iv]}\n\t"
         "pop	{%[L_AES_ARM32_td_ecb], %[L_AES_ARM32_cbc_td4]}\n\t"
@@ -24063,7 +24072,7 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         * HAVE_AES_ECB */
 #endif /* HAVE_AES_DECRYPT */
 #ifdef HAVE_AESGCM
-static const word32 L_GCM_gmult_len_r[] = {
+XALIGNED(8) static const word32 L_GCM_gmult_len_r[] = {
     0x00000000, 0x1c200000, 0x38400000, 0x24600000,
     0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000,
     0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000,
@@ -24078,15 +24087,16 @@ WC_OMIT_FRAME_POINTER void GCM_gmult_len(unsigned char* x_p,
 #else
 WC_OMIT_FRAME_POINTER void GCM_gmult_len(unsigned char* x,
     const unsigned char** m, const unsigned char* data, unsigned long len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register unsigned char* x asm ("r0") = (unsigned char*)x_p;
-    register const unsigned char** m asm ("r1") = (const unsigned char**)m_p;
-    register const unsigned char* data asm ("r2") =
+    register unsigned char* x __asm__ ("r0") = (unsigned char*)x_p;
+    register const unsigned char** m __asm__ ("r1") =
+        (const unsigned char**)m_p;
+    register const unsigned char* data __asm__ ("r2") =
         (const unsigned char*)data_p;
-    register unsigned long len asm ("r3") = (unsigned long)len_p;
-    register word32* L_GCM_gmult_len_r_c asm ("r12") =
+    register unsigned long len __asm__ ("r3") = (unsigned long)len_p;
+    register word32* L_GCM_gmult_len_r_c __asm__ ("r12") =
         (word32*)&L_GCM_gmult_len_r;
 #else
     register word32* L_GCM_gmult_len_r_c = (word32*)&L_GCM_gmult_len_r;
@@ -24096,7 +24106,7 @@ WC_OMIT_FRAME_POINTER void GCM_gmult_len(unsigned char* x,
         "push	{%[L_GCM_gmult_len_r]}\n\t"
         "mov	lr, %[L_GCM_gmult_len_r]\n\t"
         "\n"
-    "L_GCM_gmult_len_start_block_%=: \n\t"
+    "L_GCM_gmult_len_start_block_%=:\n\t"
         "push	{r3}\n\t"
         "ldr	r12, [r0, #12]\n\t"
         "ldr	%[len], [r2, #12]\n\t"
@@ -24692,16 +24702,18 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in_p,
 WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
     unsigned char* out, unsigned long len, const unsigned char* ks, int nr,
     unsigned char* ctr)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const unsigned char* in asm ("r0") = (const unsigned char*)in_p;
-    register unsigned char* out asm ("r1") = (unsigned char*)out_p;
-    register unsigned long len asm ("r2") = (unsigned long)len_p;
-    register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p;
-    register int nr asm ("r12") = (int)nr_p;
-    register unsigned char* ctr asm ("lr") = (unsigned char*)ctr_p;
-    register word32* L_AES_ARM32_te_gcm_c asm ("r4") =
+    register const unsigned char* in __asm__ ("r0") =
+        (const unsigned char*)in_p;
+    register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p;
+    register unsigned long len __asm__ ("r2") = (unsigned long)len_p;
+    register const unsigned char* ks __asm__ ("r3") =
+        (const unsigned char*)ks_p;
+    register int nr __asm__ ("r12") = (int)nr_p;
+    register unsigned char* ctr __asm__ ("lr") = (unsigned char*)ctr_p;
+    register word32* L_AES_ARM32_te_gcm_c __asm__ ("r4") =
         (word32*)L_AES_ARM32_te_gcm;
 #else
     register word32* L_AES_ARM32_te_gcm_c = (word32*)L_AES_ARM32_te_gcm;
@@ -24745,7 +24757,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "cmp	r12, #12\n\t"
         "beq	L_AES_GCM_encrypt_start_block_192_%=\n\t"
         "\n"
-    "L_AES_GCM_encrypt_loop_block_256_%=: \n\t"
+    "L_AES_GCM_encrypt_loop_block_256_%=:\n\t"
         "push	{r1, %[len], lr}\n\t"
         "ldr	lr, [sp, #16]\n\t"
         "add	r7, r7, #1\n\t"
@@ -24761,7 +24773,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_GCM_encrypt_block_nr_256_%=: \n\t"
+    "L_AES_GCM_encrypt_block_nr_256_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -25442,9 +25454,9 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "bne	L_AES_GCM_encrypt_loop_block_256_%=\n\t"
         "b	L_AES_GCM_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_GCM_encrypt_start_block_192_%=: \n\t"
+    "L_AES_GCM_encrypt_start_block_192_%=:\n\t"
         "\n"
-    "L_AES_GCM_encrypt_loop_block_192_%=: \n\t"
+    "L_AES_GCM_encrypt_loop_block_192_%=:\n\t"
         "push	{r1, %[len], lr}\n\t"
         "ldr	lr, [sp, #16]\n\t"
         "add	r7, r7, #1\n\t"
@@ -25460,7 +25472,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_GCM_encrypt_block_nr_192_%=: \n\t"
+    "L_AES_GCM_encrypt_block_nr_192_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -26141,9 +26153,9 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "bne	L_AES_GCM_encrypt_loop_block_192_%=\n\t"
         "b	L_AES_GCM_encrypt_end_%=\n\t"
         "\n"
-    "L_AES_GCM_encrypt_start_block_128_%=: \n\t"
+    "L_AES_GCM_encrypt_start_block_128_%=:\n\t"
         "\n"
-    "L_AES_GCM_encrypt_loop_block_128_%=: \n\t"
+    "L_AES_GCM_encrypt_loop_block_128_%=:\n\t"
         "push	{r1, %[len], lr}\n\t"
         "ldr	lr, [sp, #16]\n\t"
         "add	r7, r7, #1\n\t"
@@ -26159,7 +26171,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "bl	AES_encrypt_block\n\t"
 #else
         "\n"
-    "L_AES_GCM_encrypt_block_nr_128_%=: \n\t"
+    "L_AES_GCM_encrypt_block_nr_128_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "lsl	r8, r5, #8\n\t"
@@ -26839,7 +26851,7 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "add	%[out], %[out], #16\n\t"
         "bne	L_AES_GCM_encrypt_loop_block_128_%=\n\t"
         "\n"
-    "L_AES_GCM_encrypt_end_%=: \n\t"
+    "L_AES_GCM_encrypt_end_%=:\n\t"
         "pop	{%[ks], r8}\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "eor	r10, r4, r4, ror #16\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
index c740134a16e..a342fad9216 100644
--- a/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
@@ -51,19 +51,22 @@ wc_chacha_setiv:
 	pop	{r4, pc}
 	.size	wc_chacha_setiv,.-wc_chacha_setiv
 #ifdef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_chacha_arm32_constants, %object
 	.size	L_chacha_arm32_constants, 32
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_chacha_arm32_constants:
-	.word	0x61707865
-	.word	0x3120646e
-	.word	0x79622d36
-	.word	0x6b206574
-	.word	0x61707865
-	.word	0x3320646e
-	.word	0x79622d32
-	.word	0x6b206574
+	.long	0x61707865,0x3120646e,0x79622d36,0x6b206574
+	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574
 	.text
 	.align	4
 	.globl	wc_chacha_setkey
@@ -969,8 +972,8 @@ L_chacha_crypt_bytes_arm32_round_start_128:
 	vext.8	q6, q6, q6, #8
 	bne	L_chacha_crypt_bytes_arm32_round_start_128
 	# Add back state, XOR in message and store (load next block)
-	vld1.8	{q8, q9}, [r2]!
-	vld1.8	{q10, q11}, [r2]!
+	vld1.8	{q8-q9}, [r2]!
+	vld1.8	{q10-q11}, [r2]!
 	vadd.i32	q0, q0, q12
 	vadd.i32	q1, q1, q13
 	vadd.i32	q2, q2, q14
@@ -979,10 +982,10 @@ L_chacha_crypt_bytes_arm32_round_start_128:
 	veor	q1, q1, q9
 	veor	q2, q2, q10
 	veor	q3, q3, q11
-	vld1.8	{q8, q9}, [r2]!
-	vld1.8	{q10, q11}, [r2]!
-	vst1.8	{q0, q1}, [r1]!
-	vst1.8	{q2, q3}, [r1]!
+	vld1.8	{q8-q9}, [r2]!
+	vld1.8	{q10-q11}, [r2]!
+	vst1.8	{q0-q1}, [r1]!
+	vst1.8	{q2-q3}, [r1]!
 	veor	q0, q0, q0
 	mov	r12, #1
 	vmov.i32	d0[0], r12
@@ -995,8 +998,8 @@ L_chacha_crypt_bytes_arm32_round_start_128:
 	veor	q5, q5, q9
 	veor	q6, q6, q10
 	veor	q7, q7, q11
-	vst1.8	{q4, q5}, [r1]!
-	vst1.8	{q6, q7}, [r1]!
+	vst1.8	{q4-q5}, [r1]!
+	vst1.8	{q6-q7}, [r1]!
 	vadd.i32	q15, q15, q0
 	sub	r3, r3, #0x80
 	# Done 128-byte block
@@ -1075,14 +1078,14 @@ L_chacha_crypt_bytes_arm32_round_64:
 	vadd.i32	q15, q15, q9
 	blt	L_chacha_crypt_bytes_arm32_lt_64
 	# Encipher 64 bytes
-	vld1.8	{q4, q5}, [r2]!
-	vld1.8	{q6, q7}, [r2]!
+	vld1.8	{q4-q5}, [r2]!
+	vld1.8	{q6-q7}, [r2]!
 	veor	q4, q4, q0
 	veor	q5, q5, q1
 	veor	q6, q6, q2
 	veor	q7, q7, q3
-	vst1.8	{q4, q5}, [r1]!
-	vst1.8	{q6, q7}, [r1]!
+	vst1.8	{q4-q5}, [r1]!
+	vst1.8	{q6-q7}, [r1]!
 	# Check for more bytes to be enciphered
 	subs	r3, r3, #0x40
 	bne	L_chacha_crypt_bytes_arm32_loop_64
@@ -1097,10 +1100,10 @@ L_chacha_crypt_bytes_arm32_lt_64:
 	# Encipher 32 bytes
 	cmp	r3, #32
 	blt	L_chacha_crypt_bytes_arm32_lt_32
-	vld1.8	{q4, q5}, [r2]!
+	vld1.8	{q4-q5}, [r2]!
 	veor	q4, q4, q0
 	veor	q5, q5, q1
-	vst1.8	{q4, q5}, [r1]!
+	vst1.8	{q4-q5}, [r1]!
 	subs	r3, r3, #32
 	vmov	q0, q2
 	vmov	q1, q3
@@ -1153,19 +1156,22 @@ L_chacha_crypt_bytes_arm32_done_all:
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes
+#ifndef __APPLE__
 	.text
 	.type	L_chacha_setkey_arm32_constant, %object
 	.size	L_chacha_setkey_arm32_constant, 32
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_chacha_setkey_arm32_constant:
-	.word	0x61707865
-	.word	0x3120646e
-	.word	0x79622d36
-	.word	0x6b206574
-	.word	0x61707865
-	.word	0x3320646e
-	.word	0x79622d32
-	.word	0x6b206574
+	.long	0x61707865,0x3120646e,0x79622d36,0x6b206574
+	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574
 	.text
 	.align	4
 	.globl	wc_chacha_setkey
@@ -1180,7 +1186,7 @@ wc_chacha_setkey:
 #ifdef BIG_ENDIAN_ORDER
 	vrev32.16	q1, q1
 #endif /* BIG_ENDIAN_ORDER */
-	vstm	r0!, {q0, q1}
+	vstm	r0!, {q0-q1}
 	beq	L_chacha_setkey_arm32_done
 	vld1.8	{q1}, [r1]
 #ifdef BIG_ENDIAN_ORDER
diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
index 323e93ff329..47e2562e5f1 100644
--- a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -58,12 +56,12 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setiv(word32* x_p, const byte* iv_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_setiv(word32* x, const byte* iv,
     word32 counter)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register word32* x asm ("r0") = (word32*)x_p;
-    register const byte* iv asm ("r1") = (const byte*)iv_p;
-    register word32 counter asm ("r2") = (word32)counter_p;
+    register word32* x __asm__ ("r0") = (word32*)x_p;
+    register const byte* iv __asm__ ("r1") = (const byte*)iv_p;
+    register word32 counter __asm__ ("r2") = (word32)counter_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -90,7 +88,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setiv(word32* x, const byte* iv,
 }
 
 #ifdef WOLFSSL_ARMASM_NO_NEON
-static const word32 L_chacha_arm32_constants[] = {
+XALIGNED(8) static const word32 L_chacha_arm32_constants[] = {
     0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
     0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
 };
@@ -101,13 +99,13 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x_p, const byte* key_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x, const byte* key,
     word32 keySz)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register word32* x asm ("r0") = (word32*)x_p;
-    register const byte* key asm ("r1") = (const byte*)key_p;
-    register word32 keySz asm ("r2") = (word32)keySz_p;
-    register word32* L_chacha_arm32_constants_c asm ("r3") =
+    register word32* x __asm__ ("r0") = (word32*)x_p;
+    register const byte* key __asm__ ("r1") = (const byte*)key_p;
+    register word32 keySz __asm__ ("r2") = (word32)keySz_p;
+    register word32* L_chacha_arm32_constants_c __asm__ ("r3") =
         (word32*)&L_chacha_arm32_constants;
 #else
     register word32* L_chacha_arm32_constants_c =
@@ -142,7 +140,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x, const byte* key,
         "ldr	r12, [%[key], #8]\n\t"
         "ldr	lr, [%[key], #12]\n\t"
         "\n"
-    "L_chacha_arm32_setkey_same_key_bytes_%=: \n\t"
+    "L_chacha_arm32_setkey_same_key_bytes_%=:\n\t"
         "stm	%[x], {r4, r5, r12, lr}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
@@ -163,13 +161,13 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
     const byte* m, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p;
-    register byte* c asm ("r1") = (byte*)c_p;
-    register const byte* m asm ("r2") = (const byte*)m_p;
-    register word32 len asm ("r3") = (word32)len_p;
+    register ChaCha* ctx __asm__ ("r0") = (ChaCha*)ctx_p;
+    register byte* c __asm__ ("r1") = (byte*)c_p;
+    register const byte* m __asm__ ("r2") = (const byte*)m_p;
+    register word32 len __asm__ ("r3") = (word32)len_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -188,7 +186,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "strd	%[m], %[len], [sp, #40]\n\t"
 #endif
         "\n"
-    "L_chacha_arm32_crypt_block_%=: \n\t"
+    "L_chacha_arm32_crypt_block_%=:\n\t"
         /* Put x[12]..x[15] onto stack. */
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "ldr	r4, [lr, #48]\n\t"
@@ -220,7 +218,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "mov	lr, #10\n\t"
         "str	lr, [sp, #48]\n\t"
         "\n"
-    "L_chacha_arm32_crypt_loop_%=: \n\t"
+    "L_chacha_arm32_crypt_loop_%=:\n\t"
         /* 0, 4,  8, 12 */
         /* 1, 5,  9, 13 */
         "ldr	lr, [sp, #20]\n\t"
@@ -449,7 +447,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "bne	L_chacha_arm32_crypt_block_%=\n\t"
         "b	L_chacha_arm32_crypt_done_%=\n\t"
         "\n"
-    "L_chacha_arm32_crypt_lt_block_%=: \n\t"
+    "L_chacha_arm32_crypt_lt_block_%=:\n\t"
         /* Store in over field of ChaCha. */
         "ldr	lr, [sp, #32]\n\t"
         "add	r12, lr, #0x44\n\t"
@@ -467,7 +465,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "str	r12, [lr, #64]\n\t"
         "add	lr, lr, #0x44\n\t"
         "\n"
-    "L_chacha_arm32_crypt_16byte_loop_%=: \n\t"
+    "L_chacha_arm32_crypt_16byte_loop_%=:\n\t"
         "cmp	%[len], #16\n\t"
         "blt	L_chacha_arm32_crypt_word_loop_%=\n\t"
         /* 16 bytes of state XORed into message. */
@@ -490,7 +488,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "add	%[c], %[c], #16\n\t"
         "b	L_chacha_arm32_crypt_16byte_loop_%=\n\t"
         "\n"
-    "L_chacha_arm32_crypt_word_loop_%=: \n\t"
+    "L_chacha_arm32_crypt_word_loop_%=:\n\t"
         "cmp	%[len], #4\n\t"
         "blt	L_chacha_arm32_crypt_byte_start_%=\n\t"
         /* 4 bytes of state XORed into message. */
@@ -505,10 +503,10 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "add	%[c], %[c], #4\n\t"
         "b	L_chacha_arm32_crypt_word_loop_%=\n\t"
         "\n"
-    "L_chacha_arm32_crypt_byte_start_%=: \n\t"
+    "L_chacha_arm32_crypt_byte_start_%=:\n\t"
         "ldr	r4, [lr]\n\t"
         "\n"
-    "L_chacha_arm32_crypt_byte_loop_%=: \n\t"
+    "L_chacha_arm32_crypt_byte_loop_%=:\n\t"
         "ldrb	r8, [%[m]]\n\t"
         "eor	r8, r8, r4\n\t"
         "subs	%[len], %[len], #1\n\t"
@@ -519,7 +517,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "add	%[c], %[c], #1\n\t"
         "b	L_chacha_arm32_crypt_byte_loop_%=\n\t"
         "\n"
-    "L_chacha_arm32_crypt_done_%=: \n\t"
+    "L_chacha_arm32_crypt_done_%=:\n\t"
         "add	sp, sp, #52\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len)
@@ -539,18 +537,18 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over_p, byte* output_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
     const byte* input, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* over asm ("r0") = (byte*)over_p;
-    register byte* output asm ("r1") = (byte*)output_p;
-    register const byte* input asm ("r2") = (const byte*)input_p;
-    register word32 len asm ("r3") = (word32)len_p;
+    register byte* over __asm__ ("r0") = (byte*)over_p;
+    register byte* output __asm__ ("r1") = (byte*)output_p;
+    register const byte* input __asm__ ("r2") = (const byte*)input_p;
+    register word32 len __asm__ ("r3") = (word32)len_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "\n"
-    "L_chacha_arm32_over_16byte_loop_%=: \n\t"
+    "L_chacha_arm32_over_16byte_loop_%=:\n\t"
         "cmp	%[len], #16\n\t"
         "blt	L_chacha_arm32_over_word_loop_%=\n\t"
         /* 16 bytes of state XORed into message. */
@@ -577,7 +575,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
         "add	%[output], %[output], #16\n\t"
         "b	L_chacha_arm32_over_16byte_loop_%=\n\t"
         "\n"
-    "L_chacha_arm32_over_word_loop_%=: \n\t"
+    "L_chacha_arm32_over_word_loop_%=:\n\t"
         "cmp	%[len], #4\n\t"
         "blt	L_chacha_arm32_over_byte_loop_%=\n\t"
         /* 4 bytes of state XORed into message. */
@@ -592,7 +590,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
         "add	%[output], %[output], #4\n\t"
         "b	L_chacha_arm32_over_word_loop_%=\n\t"
         "\n"
-    "L_chacha_arm32_over_byte_loop_%=: \n\t"
+    "L_chacha_arm32_over_byte_loop_%=:\n\t"
         /* 4 bytes of state XORed into message. */
         "ldrb	r12, [%[over]]\n\t"
         "ldrb	r6, [%[input]]\n\t"
@@ -605,7 +603,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
         "add	%[output], %[output], #1\n\t"
         "b	L_chacha_arm32_over_byte_loop_%=\n\t"
         "\n"
-    "L_chacha_arm32_over_done_%=: \n\t"
+    "L_chacha_arm32_over_done_%=:\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
           [len] "+r" (len)
@@ -627,13 +625,13 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
     const byte* m, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p;
-    register byte* c asm ("r1") = (byte*)c_p;
-    register const byte* m asm ("r2") = (const byte*)m_p;
-    register word32 len asm ("r3") = (word32)len_p;
+    register ChaCha* ctx __asm__ ("r0") = (ChaCha*)ctx_p;
+    register byte* c __asm__ ("r1") = (byte*)c_p;
+    register const byte* m __asm__ ("r2") = (const byte*)m_p;
+    register word32 len __asm__ ("r3") = (word32)len_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -644,7 +642,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "blt	L_chacha_crypt_bytes_arm32_lt_256_%=\n\t"
         "str	%[ctx], [sp, #28]\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_start_256_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_start_256_%=:\n\t"
         "str	%[m], [sp, #32]\n\t"
         "str	%[c], [sp, #36]\n\t"
         "str	%[len], [sp, #40]\n\t"
@@ -676,7 +674,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         /* Set number of odd+even rounds to perform */
         "mov	lr, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_round_start_256_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_round_start_256_%=:\n\t"
         "subs	lr, lr, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -986,7 +984,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "cmp	%[len], #0x100\n\t"
         "bge	L_chacha_crypt_bytes_arm32_start_256_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_256_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_256_%=:\n\t"
         "cmp	%[len], #0x80\n\t"
         "blt	L_chacha_crypt_bytes_arm32_lt_128_%=\n\t"
         /* Move state into vector registers */
@@ -1006,7 +1004,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         /* Set number of odd+even rounds to perform */
         "mov	lr, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_round_start_128_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_round_start_128_%=:\n\t"
         "subs	lr, lr, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -1124,7 +1122,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "sub	%[len], %[len], #0x80\n\t"
         /* Done 128-byte block */
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_128_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_128_%=:\n\t"
         "cmp	%[len], #0\n\t"
         "beq	L_chacha_crypt_bytes_arm32_done_all_%=\n\t"
         "mov	r12, #1\n\t"
@@ -1133,7 +1131,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "vmov	d18[0], r12\n\t"
         "mov	r12, #0x40\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_loop_64_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_loop_64_%=:\n\t"
         /* Move state into vector registers */
         "vmov	q0, q12\n\t"
         "vmov	q1, q13\n\t"
@@ -1142,7 +1140,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         /* Set number of odd+even rounds to perform */
         "mov	lr, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_round_64_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_round_64_%=:\n\t"
         "subs	lr, lr, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -1214,7 +1212,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "bne	L_chacha_crypt_bytes_arm32_loop_64_%=\n\t"
         "b	L_chacha_crypt_bytes_arm32_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_64_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_64_%=:\n\t"
         /* Calculate bytes left in block not used */
         "sub	r12, r12, %[len]\n\t"
         /* Store encipher block in over for further operations and left */
@@ -1233,7 +1231,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "vmov	q1, q3\n\t"
         "beq	L_chacha_crypt_bytes_arm32_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_32_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_32_%=:\n\t"
         "cmp	%[len], #16\n\t"
         "blt	L_chacha_crypt_bytes_arm32_lt_16_%=\n\t"
         /* Encipher 16 bytes */
@@ -1244,7 +1242,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "vmov	q0, q1\n\t"
         "beq	L_chacha_crypt_bytes_arm32_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_16_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_16_%=:\n\t"
         "cmp	%[len], #8\n\t"
         "blt	L_chacha_crypt_bytes_arm32_lt_8_%=\n\t"
         /* Encipher 8 bytes */
@@ -1255,7 +1253,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "vmov	d0, d1\n\t"
         "beq	L_chacha_crypt_bytes_arm32_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_8_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_8_%=:\n\t"
         "cmp	%[len], #4\n\t"
         "blt	L_chacha_crypt_bytes_arm32_lt_4_%=\n\t"
         /* Encipher 8 bytes */
@@ -1267,10 +1265,10 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "vshr.u64	d0, d0, #32\n\t"
         "beq	L_chacha_crypt_bytes_arm32_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_lt_4_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_lt_4_%=:\n\t"
         "vmov	r12, s0\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32loop_lt_4_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32loop_lt_4_%=:\n\t"
         /* Encipher 1 byte at a time */
         "ldrb	r4, [%[m]], #1\n\t"
         "eor	r4, r4, r12\n\t"
@@ -1279,9 +1277,9 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
         "lsr	r12, r12, #8\n\t"
         "bgt	L_chacha_crypt_bytes_arm32loop_lt_4_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_done_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_done_%=:\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm32_done_all_%=: \n\t"
+    "L_chacha_crypt_bytes_arm32_done_all_%=:\n\t"
         "vstm.32	%[ctx], {q12-q15}\n\t"
         "add	sp, sp, #44\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -1297,7 +1295,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
     );
 }
 
-static const word32 L_chacha_setkey_arm32_constant[] = {
+XALIGNED(8) static const word32 L_chacha_setkey_arm32_constant[] = {
     0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
     0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
 };
@@ -1308,13 +1306,13 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x_p, const byte* key_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x, const byte* key,
     word32 keySz)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register word32* x asm ("r0") = (word32*)x_p;
-    register const byte* key asm ("r1") = (const byte*)key_p;
-    register word32 keySz asm ("r2") = (word32)keySz_p;
-    register word32* L_chacha_setkey_arm32_constant_c asm ("r3") =
+    register word32* x __asm__ ("r0") = (word32*)x_p;
+    register const byte* key __asm__ ("r1") = (const byte*)key_p;
+    register word32 keySz __asm__ ("r2") = (word32)keySz_p;
+    register word32* L_chacha_setkey_arm32_constant_c __asm__ ("r3") =
         (word32*)&L_chacha_setkey_arm32_constant;
 #else
     register word32* L_chacha_setkey_arm32_constant_c =
@@ -1338,7 +1336,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x, const byte* key,
         "vrev32.16	q1, q1\n\t"
 #endif /* BIG_ENDIAN_ORDER */
         "\n"
-    "L_chacha_setkey_arm32_done_%=: \n\t"
+    "L_chacha_setkey_arm32_done_%=:\n\t"
         "vstm	%[x], {q1}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
@@ -1359,14 +1357,14 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over_p, byte* output_p,
 #else
 WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
     const byte* input, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* over asm ("r0") = (byte*)over_p;
-    register byte* output asm ("r1") = (byte*)output_p;
-    register const byte* input asm ("r2") = (const byte*)input_p;
-    register word32 len asm ("r3") = (word32)len_p;
-    register word32* L_chacha_setkey_arm32_constant_c asm ("r12") =
+    register byte* over __asm__ ("r0") = (byte*)over_p;
+    register byte* output __asm__ ("r1") = (byte*)output_p;
+    register const byte* input __asm__ ("r2") = (const byte*)input_p;
+    register word32 len __asm__ ("r3") = (word32)len_p;
+    register word32* L_chacha_setkey_arm32_constant_c __asm__ ("r12") =
         (word32*)&L_chacha_setkey_arm32_constant;
 #else
     register word32* L_chacha_setkey_arm32_constant_c =
@@ -1376,7 +1374,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
     __asm__ __volatile__ (
         "push	{%[L_chacha_setkey_arm32_constant]}\n\t"
         "\n"
-    "L_chacha_use_over_arm32_16byte_loop_%=: \n\t"
+    "L_chacha_use_over_arm32_16byte_loop_%=:\n\t"
         "cmp	%[len], #16\n\t"
         "blt	L_chacha_use_over_arm32_word_loop_%=\n\t"
         /* 16 bytes of state XORed into message. */
@@ -1388,7 +1386,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
         "beq	L_chacha_use_over_arm32_done_%=\n\t"
         "b	L_chacha_use_over_arm32_16byte_loop_%=\n\t"
         "\n"
-    "L_chacha_use_over_arm32_word_loop_%=: \n\t"
+    "L_chacha_use_over_arm32_word_loop_%=:\n\t"
         "cmp	%[len], #4\n\t"
         "blt	L_chacha_use_over_arm32_byte_loop_%=\n\t"
         /* 4 bytes of state XORed into message. */
@@ -1400,7 +1398,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
         "beq	L_chacha_use_over_arm32_done_%=\n\t"
         "b	L_chacha_use_over_arm32_word_loop_%=\n\t"
         "\n"
-    "L_chacha_use_over_arm32_byte_loop_%=: \n\t"
+    "L_chacha_use_over_arm32_byte_loop_%=:\n\t"
         /* 1 bytes of state XORed into message. */
         "ldrb	r12, [%[over]], #1\n\t"
         "ldrb	lr, [%[input]], #1\n\t"
@@ -1410,7 +1408,7 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
         "beq	L_chacha_use_over_arm32_done_%=\n\t"
         "b	L_chacha_use_over_arm32_byte_loop_%=\n\t"
         "\n"
-    "L_chacha_use_over_arm32_done_%=: \n\t"
+    "L_chacha_use_over_arm32_done_%=:\n\t"
         "pop	{%[L_chacha_setkey_arm32_constant]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S
index 2e333d16387..e307cb9fb4d 100644
--- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S
@@ -2737,12 +2737,12 @@ fe_mul_op:
 	lsl	r0, r0, #1
 	orr	r0, r0, r10, lsr #31
 	mul	r11, r0, lr
-	pop	{r0, r1, r2}
+	pop	{r0-r2}
 	mov	lr, #38
 	umaal	r0, r11, r12, lr
 	umaal	r1, r11, r4, lr
 	umaal	r2, r11, r5, lr
-	pop	{r3, r4, r5}
+	pop	{r3-r5}
 	umaal	r3, r11, r6, lr
 	umaal	r4, r11, r7, lr
 	umaal	r5, r11, r8, lr
@@ -3147,18 +3147,18 @@ fe_sq_op:
 	lsl	r0, r0, #1
 	orr	r0, r0, r7, lsr #31
 	mul	lr, r0, r6
-	pop	{r0, r1}
+	pop	{r0-r1}
 	mov	r6, #38
 	umaal	r0, lr, r12, r6
 	umaal	r1, lr, r11, r6
 	mov	r12, r3
 	mov	r11, r4
-	pop	{r2, r3, r4}
+	pop	{r2-r4}
 	umaal	r2, lr, r10, r6
 	umaal	r3, lr, r12, r6
 	umaal	r4, lr, r11, r6
 	mov	r12, r6
-	pop	{r5, r6}
+	pop	{r5-r6}
 	umaal	r5, lr, r8, r12
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 	bic	r7, r7, #0x80000000
@@ -4563,18 +4563,18 @@ fe_sq2:
 	lsl	r0, r0, #1
 	orr	r0, r0, r7, lsr #31
 	mul	lr, r0, r6
-	pop	{r0, r1}
+	pop	{r0-r1}
 	mov	r6, #38
 	umaal	r0, lr, r12, r6
 	umaal	r1, lr, r11, r6
 	mov	r12, r3
 	mov	r11, r4
-	pop	{r2, r3, r4}
+	pop	{r2-r4}
 	umaal	r2, lr, r10, r6
 	umaal	r3, lr, r12, r6
 	umaal	r4, lr, r11, r6
 	mov	r12, r6
-	pop	{r5, r6}
+	pop	{r5-r6}
 	umaal	r5, lr, r8, r12
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
 	bic	r7, r7, #0x80000000
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
index 7ed31b978c2..c981871e4bd 100644
--- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -64,7 +62,7 @@
 WC_OMIT_FRAME_POINTER void fe_init()
 #else
 WC_OMIT_FRAME_POINTER void fe_init()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -86,7 +84,7 @@ void fe_add_sub_op(void);
 WC_OMIT_FRAME_POINTER void fe_add_sub_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_add_sub_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -305,7 +303,7 @@ void fe_sub_op(void);
 WC_OMIT_FRAME_POINTER void fe_sub_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_sub_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -357,12 +355,12 @@ WC_OMIT_FRAME_POINTER void fe_sub_op()
 WC_OMIT_FRAME_POINTER void fe_sub(fe r_p, const fe a_p, const fe b_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_sub(fe r, const fe a, const fe b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
-    register const sword32* b asm ("r2") = (const sword32*)b_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
+    register const sword32* b __asm__ ("r2") = (const sword32*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -384,7 +382,7 @@ void fe_add_op(void);
 WC_OMIT_FRAME_POINTER void fe_add_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_add_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -437,12 +435,12 @@ WC_OMIT_FRAME_POINTER void fe_add_op()
 WC_OMIT_FRAME_POINTER void fe_add(fe r_p, const fe a_p, const fe b_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_add(fe r, const fe a, const fe b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
-    register const sword32* b asm ("r2") = (const sword32*)b_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
+    register const sword32* b __asm__ ("r2") = (const sword32*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -464,11 +462,12 @@ WC_OMIT_FRAME_POINTER void fe_add(fe r, const fe a, const fe b)
 WC_OMIT_FRAME_POINTER void fe_frombytes(fe out_p, const unsigned char* in_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_frombytes(fe out, const unsigned char* in)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* out asm ("r0") = (sword32*)out_p;
-    register const unsigned char* in asm ("r1") = (const unsigned char*)in_p;
+    register sword32* out __asm__ ("r0") = (sword32*)out_p;
+    register const unsigned char* in __asm__ ("r1") =
+        (const unsigned char*)in_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -508,11 +507,11 @@ WC_OMIT_FRAME_POINTER void fe_frombytes(fe out, const unsigned char* in)
 WC_OMIT_FRAME_POINTER void fe_tobytes(unsigned char* out_p, const fe n_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_tobytes(unsigned char* out, const fe n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register unsigned char* out asm ("r0") = (unsigned char*)out_p;
-    register const sword32* n asm ("r1") = (const sword32*)n_p;
+    register unsigned char* out __asm__ ("r0") = (unsigned char*)out_p;
+    register const sword32* n __asm__ ("r1") = (const sword32*)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -563,10 +562,10 @@ WC_OMIT_FRAME_POINTER void fe_tobytes(unsigned char* out, const fe n)
 WC_OMIT_FRAME_POINTER void fe_1(fe n_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_1(fe n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* n asm ("r0") = (sword32*)n_p;
+    register sword32* n __asm__ ("r0") = (sword32*)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -595,10 +594,10 @@ WC_OMIT_FRAME_POINTER void fe_1(fe n)
 WC_OMIT_FRAME_POINTER void fe_0(fe n_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_0(fe n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* n asm ("r0") = (sword32*)n_p;
+    register sword32* n __asm__ ("r0") = (sword32*)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -627,11 +626,11 @@ WC_OMIT_FRAME_POINTER void fe_0(fe n)
 WC_OMIT_FRAME_POINTER void fe_copy(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_copy(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -697,11 +696,11 @@ WC_OMIT_FRAME_POINTER void fe_copy(fe r, const fe a)
 WC_OMIT_FRAME_POINTER void fe_neg(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_neg(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -735,10 +734,10 @@ WC_OMIT_FRAME_POINTER void fe_neg(fe r, const fe a)
 WC_OMIT_FRAME_POINTER int fe_isnonzero(const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER int fe_isnonzero(const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sword32* a asm ("r0") = (const sword32*)a_p;
+    register const sword32* a __asm__ ("r0") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -790,10 +789,10 @@ WC_OMIT_FRAME_POINTER int fe_isnonzero(const fe a)
 WC_OMIT_FRAME_POINTER int fe_isnegative(const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER int fe_isnegative(const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sword32* a asm ("r0") = (const sword32*)a_p;
+    register const sword32* a __asm__ ("r0") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -831,12 +830,12 @@ WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r_p, const fe* base_p,
     signed char b_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r, const fe* base, signed char b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register fe* r asm ("r0") = (fe*)r_p;
-    register const fe* base asm ("r1") = (const fe*)base_p;
-    register signed char b asm ("r2") = (signed char)b_p;
+    register fe* r __asm__ ("r0") = (fe*)r_p;
+    register const fe* base __asm__ ("r1") = (const fe*)base_p;
+    register signed char b __asm__ ("r2") = (signed char)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2377,12 +2376,12 @@ WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r_p, const fe* base_p,
     signed char b_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r, const fe* base, signed char b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register fe* r asm ("r0") = (fe*)r_p;
-    register const fe* base asm ("r1") = (const fe*)base_p;
-    register signed char b asm ("r2") = (signed char)b_p;
+    register fe* r __asm__ ("r0") = (fe*)r_p;
+    register const fe* base __asm__ ("r1") = (const fe*)base_p;
+    register signed char b __asm__ ("r2") = (signed char)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2514,7 +2513,7 @@ void fe_mul_op(void);
 WC_OMIT_FRAME_POINTER void fe_mul_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_mul_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -2909,7 +2908,7 @@ void fe_mul_op(void);
 WC_OMIT_FRAME_POINTER void fe_mul_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_mul_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -3062,12 +3061,12 @@ WC_OMIT_FRAME_POINTER void fe_mul_op()
 WC_OMIT_FRAME_POINTER void fe_mul(fe r_p, const fe a_p, const fe b_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_mul(fe r, const fe a, const fe b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
-    register const sword32* b asm ("r2") = (const sword32*)b_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
+    register const sword32* b __asm__ ("r2") = (const sword32*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3090,7 +3089,7 @@ void fe_sq_op(void);
 WC_OMIT_FRAME_POINTER void fe_sq_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_sq_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -3378,7 +3377,7 @@ void fe_sq_op(void);
 WC_OMIT_FRAME_POINTER void fe_sq_op()
 #else
 WC_OMIT_FRAME_POINTER void fe_sq_op()
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -3517,11 +3516,11 @@ WC_OMIT_FRAME_POINTER void fe_sq_op()
 WC_OMIT_FRAME_POINTER void fe_sq(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_sq(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3544,11 +3543,11 @@ WC_OMIT_FRAME_POINTER void fe_sq(fe r, const fe a)
 WC_OMIT_FRAME_POINTER void fe_mul121666(fe r_p, fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_mul121666(fe r, fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register sword32* a asm ("r1") = (sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register sword32* a __asm__ ("r1") = (sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3628,11 +3627,11 @@ WC_OMIT_FRAME_POINTER void fe_mul121666(fe r, fe a)
 WC_OMIT_FRAME_POINTER void fe_mul121666(fe r_p, fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_mul121666(fe r, fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register sword32* a asm ("r1") = (sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register sword32* a __asm__ ("r1") = (sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3701,12 +3700,12 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r_p, const byte* n_p,
     const byte* a_p)
 #else
 WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* r asm ("r0") = (byte*)r_p;
-    register const byte* n asm ("r1") = (const byte*)n_p;
-    register const byte* a asm ("r2") = (const byte*)a_p;
+    register byte* r __asm__ ("r0") = (byte*)r_p;
+    register const byte* n __asm__ ("r1") = (const byte*)n_p;
+    register const byte* a __asm__ ("r2") = (const byte*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3739,9 +3738,9 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "mov	%[a], #28\n\t"
         "str	%[a], [sp, #176]\n\t"
         "\n"
-    "L_curve25519_words_%=: \n\t"
+    "L_curve25519_words_%=:\n\t"
         "\n"
-    "L_curve25519_bits_%=: \n\t"
+    "L_curve25519_bits_%=:\n\t"
         "ldr	%[n], [sp, #164]\n\t"
         "ldr	%[a], [%[n], r2]\n\t"
         "ldr	%[n], [sp, #180]\n\t"
@@ -3957,7 +3956,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #4\n\t"
         "\n"
-    "L_curve25519_inv_1_%=: \n\t"
+    "L_curve25519_inv_1_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -3974,7 +3973,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #9\n\t"
         "\n"
-    "L_curve25519_inv_2_%=: \n\t"
+    "L_curve25519_inv_2_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -3991,7 +3990,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #19\n\t"
         "\n"
-    "L_curve25519_inv_3_%=: \n\t"
+    "L_curve25519_inv_3_%=:\n\t"
         "add	r1, sp, #0x80\n\t"
         "add	r0, sp, #0x80\n\t"
         "push	{r12}\n\t"
@@ -4005,7 +4004,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #10\n\t"
         "\n"
-    "L_curve25519_inv_4_%=: \n\t"
+    "L_curve25519_inv_4_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4022,7 +4021,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #49\n\t"
         "\n"
-    "L_curve25519_inv_5_%=: \n\t"
+    "L_curve25519_inv_5_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4039,7 +4038,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #0x63\n\t"
         "\n"
-    "L_curve25519_inv_6_%=: \n\t"
+    "L_curve25519_inv_6_%=:\n\t"
         "add	r1, sp, #0x80\n\t"
         "add	r0, sp, #0x80\n\t"
         "push	{r12}\n\t"
@@ -4053,7 +4052,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #50\n\t"
         "\n"
-    "L_curve25519_inv_7_%=: \n\t"
+    "L_curve25519_inv_7_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4067,7 +4066,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #5\n\t"
         "\n"
-    "L_curve25519_inv_8_%=: \n\t"
+    "L_curve25519_inv_8_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4104,12 +4103,12 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r_p, const byte* n_p,
     const byte* a_p)
 #else
 WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* r asm ("r0") = (byte*)r_p;
-    register const byte* n asm ("r1") = (const byte*)n_p;
-    register const byte* a asm ("r2") = (const byte*)a_p;
+    register byte* r __asm__ ("r0") = (byte*)r_p;
+    register const byte* n __asm__ ("r1") = (const byte*)n_p;
+    register const byte* a __asm__ ("r2") = (const byte*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4144,7 +4143,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "stm	r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         "mov	%[a], #0xfe\n\t"
         "\n"
-    "L_curve25519_bits_%=: \n\t"
+    "L_curve25519_bits_%=:\n\t"
         "str	%[a], [sp, #168]\n\t"
         "ldr	%[n], [sp, #160]\n\t"
         "and	r4, %[a], #31\n\t"
@@ -4264,7 +4263,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #4\n\t"
         "\n"
-    "L_curve25519_inv_1_%=: \n\t"
+    "L_curve25519_inv_1_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4281,7 +4280,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #9\n\t"
         "\n"
-    "L_curve25519_inv_2_%=: \n\t"
+    "L_curve25519_inv_2_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4298,7 +4297,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #19\n\t"
         "\n"
-    "L_curve25519_inv_3_%=: \n\t"
+    "L_curve25519_inv_3_%=:\n\t"
         "add	r1, sp, #0x80\n\t"
         "add	r0, sp, #0x80\n\t"
         "push	{r12}\n\t"
@@ -4312,7 +4311,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #10\n\t"
         "\n"
-    "L_curve25519_inv_4_%=: \n\t"
+    "L_curve25519_inv_4_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4329,7 +4328,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #49\n\t"
         "\n"
-    "L_curve25519_inv_5_%=: \n\t"
+    "L_curve25519_inv_5_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4346,7 +4345,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #0x63\n\t"
         "\n"
-    "L_curve25519_inv_6_%=: \n\t"
+    "L_curve25519_inv_6_%=:\n\t"
         "add	r1, sp, #0x80\n\t"
         "add	r0, sp, #0x80\n\t"
         "push	{r12}\n\t"
@@ -4360,7 +4359,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #50\n\t"
         "\n"
-    "L_curve25519_inv_7_%=: \n\t"
+    "L_curve25519_inv_7_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4374,7 +4373,7 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #5\n\t"
         "\n"
-    "L_curve25519_inv_8_%=: \n\t"
+    "L_curve25519_inv_8_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4431,11 +4430,11 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
 WC_OMIT_FRAME_POINTER void fe_invert(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4472,7 +4471,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #4\n\t"
         "\n"
-    "L_fe_invert1_%=: \n\t"
+    "L_fe_invert1_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4489,7 +4488,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #9\n\t"
         "\n"
-    "L_fe_invert2_%=: \n\t"
+    "L_fe_invert2_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4506,7 +4505,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #19\n\t"
         "\n"
-    "L_fe_invert3_%=: \n\t"
+    "L_fe_invert3_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4520,7 +4519,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #10\n\t"
         "\n"
-    "L_fe_invert4_%=: \n\t"
+    "L_fe_invert4_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4537,7 +4536,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #49\n\t"
         "\n"
-    "L_fe_invert5_%=: \n\t"
+    "L_fe_invert5_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4554,7 +4553,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #0x63\n\t"
         "\n"
-    "L_fe_invert6_%=: \n\t"
+    "L_fe_invert6_%=:\n\t"
         "add	r1, sp, #0x60\n\t"
         "add	r0, sp, #0x60\n\t"
         "push	{r12}\n\t"
@@ -4568,7 +4567,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #50\n\t"
         "\n"
-    "L_fe_invert7_%=: \n\t"
+    "L_fe_invert7_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -4582,7 +4581,7 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #5\n\t"
         "\n"
-    "L_fe_invert8_%=: \n\t"
+    "L_fe_invert8_%=:\n\t"
         "add	r1, sp, #32\n\t"
         "add	r0, sp, #32\n\t"
         "push	{r12}\n\t"
@@ -4614,11 +4613,11 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
 WC_OMIT_FRAME_POINTER void fe_sq2(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_sq2(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4945,11 +4944,11 @@ WC_OMIT_FRAME_POINTER void fe_sq2(fe r, const fe a)
 WC_OMIT_FRAME_POINTER void fe_sq2(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_sq2(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5135,11 +5134,11 @@ WC_OMIT_FRAME_POINTER void fe_sq2(fe r, const fe a)
 WC_OMIT_FRAME_POINTER void fe_pow22523(fe r_p, const fe a_p)
 #else
 WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword32* r asm ("r0") = (sword32*)r_p;
-    register const sword32* a asm ("r1") = (const sword32*)a_p;
+    register sword32* r __asm__ ("r0") = (sword32*)r_p;
+    register const sword32* a __asm__ ("r1") = (const sword32*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5176,7 +5175,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #4\n\t"
         "\n"
-    "L_fe_pow22523_1_%=: \n\t"
+    "L_fe_pow22523_1_%=:\n\t"
         "add	r1, sp, #32\n\t"
         "add	r0, sp, #32\n\t"
         "push	{r12}\n\t"
@@ -5193,7 +5192,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #9\n\t"
         "\n"
-    "L_fe_pow22523_2_%=: \n\t"
+    "L_fe_pow22523_2_%=:\n\t"
         "add	r1, sp, #32\n\t"
         "add	r0, sp, #32\n\t"
         "push	{r12}\n\t"
@@ -5210,7 +5209,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #19\n\t"
         "\n"
-    "L_fe_pow22523_3_%=: \n\t"
+    "L_fe_pow22523_3_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -5224,7 +5223,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #10\n\t"
         "\n"
-    "L_fe_pow22523_4_%=: \n\t"
+    "L_fe_pow22523_4_%=:\n\t"
         "add	r1, sp, #32\n\t"
         "add	r0, sp, #32\n\t"
         "push	{r12}\n\t"
@@ -5241,7 +5240,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #49\n\t"
         "\n"
-    "L_fe_pow22523_5_%=: \n\t"
+    "L_fe_pow22523_5_%=:\n\t"
         "add	r1, sp, #32\n\t"
         "add	r0, sp, #32\n\t"
         "push	{r12}\n\t"
@@ -5258,7 +5257,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_sq_op\n\t"
         "mov	r12, #0x63\n\t"
         "\n"
-    "L_fe_pow22523_6_%=: \n\t"
+    "L_fe_pow22523_6_%=:\n\t"
         "add	r1, sp, #0x40\n\t"
         "add	r0, sp, #0x40\n\t"
         "push	{r12}\n\t"
@@ -5272,7 +5271,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #50\n\t"
         "\n"
-    "L_fe_pow22523_7_%=: \n\t"
+    "L_fe_pow22523_7_%=:\n\t"
         "add	r1, sp, #32\n\t"
         "add	r0, sp, #32\n\t"
         "push	{r12}\n\t"
@@ -5286,7 +5285,7 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "bl	fe_mul_op\n\t"
         "mov	r12, #2\n\t"
         "\n"
-    "L_fe_pow22523_8_%=: \n\t"
+    "L_fe_pow22523_8_%=:\n\t"
         "mov	r1, sp\n\t"
         "mov	r0, sp\n\t"
         "push	{r12}\n\t"
@@ -5317,11 +5316,11 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
 WC_OMIT_FRAME_POINTER void ge_p1p1_to_p2(ge_p2 * r_p, const ge_p1p1 * p_p)
 #else
 WC_OMIT_FRAME_POINTER void ge_p1p1_to_p2(ge_p2 * r, const ge_p1p1 * p)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p2 * r asm ("r0") = (ge_p2 *)r_p;
-    register const ge_p1p1 * p asm ("r1") = (const ge_p1p1 *)p_p;
+    register ge_p2 * r __asm__ ("r0") = (ge_p2 *)r_p;
+    register const ge_p1p1 * p __asm__ ("r1") = (const ge_p1p1 *)p_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5359,11 +5358,11 @@ WC_OMIT_FRAME_POINTER void ge_p1p1_to_p2(ge_p2 * r, const ge_p1p1 * p)
 WC_OMIT_FRAME_POINTER void ge_p1p1_to_p3(ge_p3 * r_p, const ge_p1p1 * p_p)
 #else
 WC_OMIT_FRAME_POINTER void ge_p1p1_to_p3(ge_p3 * r, const ge_p1p1 * p)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p3 * r asm ("r0") = (ge_p3 *)r_p;
-    register const ge_p1p1 * p asm ("r1") = (const ge_p1p1 *)p_p;
+    register ge_p3 * r __asm__ ("r0") = (ge_p3 *)r_p;
+    register const ge_p1p1 * p __asm__ ("r1") = (const ge_p1p1 *)p_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5406,11 +5405,11 @@ WC_OMIT_FRAME_POINTER void ge_p1p1_to_p3(ge_p3 * r, const ge_p1p1 * p)
 WC_OMIT_FRAME_POINTER void ge_p2_dbl(ge_p1p1 * r_p, const ge_p2 * p_p)
 #else
 WC_OMIT_FRAME_POINTER void ge_p2_dbl(ge_p1p1 * r, const ge_p2 * p)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p1p1 * r asm ("r0") = (ge_p1p1 *)r_p;
-    register const ge_p2 * p asm ("r1") = (const ge_p2 *)p_p;
+    register ge_p1p1 * r __asm__ ("r0") = (ge_p1p1 *)r_p;
+    register const ge_p2 * p __asm__ ("r1") = (const ge_p2 *)p_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5467,12 +5466,12 @@ WC_OMIT_FRAME_POINTER void ge_madd(ge_p1p1 * r_p, const ge_p3 * p_p,
 #else
 WC_OMIT_FRAME_POINTER void ge_madd(ge_p1p1 * r, const ge_p3 * p,
     const ge_precomp * q)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p1p1 * r asm ("r0") = (ge_p1p1 *)r_p;
-    register const ge_p3 * p asm ("r1") = (const ge_p3 *)p_p;
-    register const ge_precomp * q asm ("r2") = (const ge_precomp *)q_p;
+    register ge_p1p1 * r __asm__ ("r0") = (ge_p1p1 *)r_p;
+    register const ge_p3 * p __asm__ ("r1") = (const ge_p3 *)p_p;
+    register const ge_precomp * q __asm__ ("r2") = (const ge_precomp *)q_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5567,12 +5566,12 @@ WC_OMIT_FRAME_POINTER void ge_msub(ge_p1p1 * r_p, const ge_p3 * p_p,
 #else
 WC_OMIT_FRAME_POINTER void ge_msub(ge_p1p1 * r, const ge_p3 * p,
     const ge_precomp * q)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p1p1 * r asm ("r0") = (ge_p1p1 *)r_p;
-    register const ge_p3 * p asm ("r1") = (const ge_p3 *)p_p;
-    register const ge_precomp * q asm ("r2") = (const ge_precomp *)q_p;
+    register ge_p1p1 * r __asm__ ("r0") = (ge_p1p1 *)r_p;
+    register const ge_p3 * p __asm__ ("r1") = (const ge_p3 *)p_p;
+    register const ge_precomp * q __asm__ ("r2") = (const ge_precomp *)q_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5668,12 +5667,12 @@ WC_OMIT_FRAME_POINTER void ge_add(ge_p1p1 * r_p, const ge_p3 * p_p,
 #else
 WC_OMIT_FRAME_POINTER void ge_add(ge_p1p1 * r, const ge_p3 * p,
     const ge_cached* q)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p1p1 * r asm ("r0") = (ge_p1p1 *)r_p;
-    register const ge_p3 * p asm ("r1") = (const ge_p3 *)p_p;
-    register const ge_cached* q asm ("r2") = (const ge_cached*)q_p;
+    register ge_p1p1 * r __asm__ ("r0") = (ge_p1p1 *)r_p;
+    register const ge_p3 * p __asm__ ("r1") = (const ge_p3 *)p_p;
+    register const ge_cached* q __asm__ ("r2") = (const ge_cached*)q_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5769,12 +5768,12 @@ WC_OMIT_FRAME_POINTER void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p,
 #else
 WC_OMIT_FRAME_POINTER void ge_sub(ge_p1p1 * r, const ge_p3 * p,
     const ge_cached* q)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register ge_p1p1 * r asm ("r0") = (ge_p1p1 *)r_p;
-    register const ge_p3 * p asm ("r1") = (const ge_p3 *)p_p;
-    register const ge_cached* q asm ("r2") = (const ge_cached*)q_p;
+    register ge_p1p1 * r __asm__ ("r0") = (ge_p1p1 *)r_p;
+    register const ge_p3 * p __asm__ ("r1") = (const ge_p3 *)p_p;
+    register const ge_cached* q __asm__ ("r2") = (const ge_cached*)q_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5871,10 +5870,10 @@ WC_OMIT_FRAME_POINTER void ge_sub(ge_p1p1 * r, const ge_p3 * p,
 WC_OMIT_FRAME_POINTER void sc_reduce(byte* s_p)
 #else
 WC_OMIT_FRAME_POINTER void sc_reduce(byte* s)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* s asm ("r0") = (byte*)s_p;
+    register byte* s __asm__ ("r0") = (byte*)s_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -6671,10 +6670,10 @@ WC_OMIT_FRAME_POINTER void sc_reduce(byte* s)
 WC_OMIT_FRAME_POINTER void sc_reduce(byte* s_p)
 #else
 WC_OMIT_FRAME_POINTER void sc_reduce(byte* s)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* s asm ("r0") = (byte*)s_p;
+    register byte* s __asm__ ("r0") = (byte*)s_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -7346,13 +7345,13 @@ WC_OMIT_FRAME_POINTER void sc_muladd(byte* s_p, const byte* a_p,
 #else
 WC_OMIT_FRAME_POINTER void sc_muladd(byte* s, const byte* a, const byte* b,
     const byte* c)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* s asm ("r0") = (byte*)s_p;
-    register const byte* a asm ("r1") = (const byte*)a_p;
-    register const byte* b asm ("r2") = (const byte*)b_p;
-    register const byte* c asm ("r3") = (const byte*)c_p;
+    register byte* s __asm__ ("r0") = (byte*)s_p;
+    register const byte* a __asm__ ("r1") = (const byte*)a_p;
+    register const byte* b __asm__ ("r2") = (const byte*)b_p;
+    register const byte* c __asm__ ("r3") = (const byte*)c_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -8508,13 +8507,13 @@ WC_OMIT_FRAME_POINTER void sc_muladd(byte* s_p, const byte* a_p,
 #else
 WC_OMIT_FRAME_POINTER void sc_muladd(byte* s, const byte* a, const byte* b,
     const byte* c)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register byte* s asm ("r0") = (byte*)s_p;
-    register const byte* a asm ("r1") = (const byte*)a_p;
-    register const byte* b asm ("r2") = (const byte*)b_p;
-    register const byte* c asm ("r3") = (const byte*)c_p;
+    register byte* s __asm__ ("r0") = (byte*)s_p;
+    register const byte* a __asm__ ("r1") = (const byte*)a_p;
+    register const byte* b __asm__ ("r2") = (const byte*)b_p;
+    register const byte* c __asm__ ("r3") = (const byte*)c_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
diff --git a/wolfcrypt/src/port/arm/armv8-32-mlkem-asm.S b/wolfcrypt/src/port/arm/armv8-32-mlkem-asm.S
index 90a275c2326..638a0310d12 100644
--- a/wolfcrypt/src/port/arm/armv8-32-mlkem-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-mlkem-asm.S
@@ -31,139 +31,36 @@
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
 #ifndef WOLFSSL_ARMASM_INLINE
 #ifdef WOLFSSL_WC_MLKEM
+#ifndef __APPLE__
 	.text
 	.type	L_mlkem_arm32_ntt_zetas, %object
 	.size	L_mlkem_arm32_ntt_zetas, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 4-byte aligned, 32-bit aligned
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_mlkem_arm32_ntt_zetas:
-	.short	0x8ed
-	.short	0xa0b
-	.short	0xb9a
-	.short	0x714
-	.short	0x5d5
-	.short	0x58e
-	.short	0x11f
-	.short	0xca
-	.short	0xc56
-	.short	0x26e
-	.short	0x629
-	.short	0xb6
-	.short	0x3c2
-	.short	0x84f
-	.short	0x73f
-	.short	0x5bc
-	.short	0x23d
-	.short	0x7d4
-	.short	0x108
-	.short	0x17f
-	.short	0x9c4
-	.short	0x5b2
-	.short	0x6bf
-	.short	0xc7f
-	.short	0xa58
-	.short	0x3f9
-	.short	0x2dc
-	.short	0x260
-	.short	0x6fb
-	.short	0x19b
-	.short	0xc34
-	.short	0x6de
-	.short	0x4c7
-	.short	0x28c
-	.short	0xad9
-	.short	0x3f7
-	.short	0x7f4
-	.short	0x5d3
-	.short	0xbe7
-	.short	0x6f9
-	.short	0x204
-	.short	0xcf9
-	.short	0xbc1
-	.short	0xa67
-	.short	0x6af
-	.short	0x877
-	.short	0x7e
-	.short	0x5bd
-	.short	0x9ac
-	.short	0xca7
-	.short	0xbf2
-	.short	0x33e
-	.short	0x6b
-	.short	0x774
-	.short	0xc0a
-	.short	0x94a
-	.short	0xb73
-	.short	0x3c1
-	.short	0x71d
-	.short	0xa2c
-	.short	0x1c0
-	.short	0x8d8
-	.short	0x2a5
-	.short	0x806
-	.short	0x8b2
-	.short	0x1ae
-	.short	0x22b
-	.short	0x34b
-	.short	0x81e
-	.short	0x367
-	.short	0x60e
-	.short	0x69
-	.short	0x1a6
-	.short	0x24b
-	.short	0xb1
-	.short	0xc16
-	.short	0xbde
-	.short	0xb35
-	.short	0x626
-	.short	0x675
-	.short	0xc0b
-	.short	0x30a
-	.short	0x487
-	.short	0xc6e
-	.short	0x9f8
-	.short	0x5cb
-	.short	0xaa7
-	.short	0x45f
-	.short	0x6cb
-	.short	0x284
-	.short	0x999
-	.short	0x15d
-	.short	0x1a2
-	.short	0x149
-	.short	0xc65
-	.short	0xcb6
-	.short	0x331
-	.short	0x449
-	.short	0x25b
-	.short	0x262
-	.short	0x52a
-	.short	0x7fc
-	.short	0x748
-	.short	0x180
-	.short	0x842
-	.short	0xc79
-	.short	0x4c2
-	.short	0x7ca
-	.short	0x997
-	.short	0xdc
-	.short	0x85e
-	.short	0x686
-	.short	0x860
-	.short	0x707
-	.short	0x803
-	.short	0x31a
-	.short	0x71b
-	.short	0x9ab
-	.short	0x99b
-	.short	0x1de
-	.short	0xc95
-	.short	0xbcd
-	.short	0x3e4
-	.short	0x3df
-	.short	0x3be
-	.short	0x74d
-	.short	0x5f2
-	.short	0x65c
+	.short	0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
+	.short	0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc
+	.short	0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f
+	.short	0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de
+	.short	0x04c7,0x028c,0x0ad9,0x03f7,0x07f4,0x05d3,0x0be7,0x06f9
+	.short	0x0204,0x0cf9,0x0bc1,0x0a67,0x06af,0x0877,0x007e,0x05bd
+	.short	0x09ac,0x0ca7,0x0bf2,0x033e,0x006b,0x0774,0x0c0a,0x094a
+	.short	0x0b73,0x03c1,0x071d,0x0a2c,0x01c0,0x08d8,0x02a5,0x0806
+	.short	0x08b2,0x01ae,0x022b,0x034b,0x081e,0x0367,0x060e,0x0069
+	.short	0x01a6,0x024b,0x00b1,0x0c16,0x0bde,0x0b35,0x0626,0x0675
+	.short	0x0c0b,0x030a,0x0487,0x0c6e,0x09f8,0x05cb,0x0aa7,0x045f
+	.short	0x06cb,0x0284,0x0999,0x015d,0x01a2,0x0149,0x0c65,0x0cb6
+	.short	0x0331,0x0449,0x025b,0x0262,0x052a,0x07fc,0x0748,0x0180
+	.short	0x0842,0x0c79,0x04c2,0x07ca,0x0997,0x00dc,0x085e,0x0686
+	.short	0x0860,0x0707,0x0803,0x031a,0x071b,0x09ab,0x099b,0x01de
+	.short	0x0c95,0x0bcd,0x03e4,0x03df,0x03be,0x074d,0x05f2,0x065c
 	.text
 	.align	4
 	.globl	mlkem_arm32_ntt
@@ -3155,139 +3052,36 @@ L_mlkem_arm32_ntt_loop_567:
 	add	sp, sp, #8
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	mlkem_arm32_ntt,.-mlkem_arm32_ntt
+#ifndef __APPLE__
 	.text
 	.type	L_mlkem_invntt_zetas_inv, %object
 	.size	L_mlkem_invntt_zetas_inv, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 4-byte aligned, 32-bit aligned
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_mlkem_invntt_zetas_inv:
-	.short	0x6a5
-	.short	0x70f
-	.short	0x5b4
-	.short	0x943
-	.short	0x922
-	.short	0x91d
-	.short	0x134
-	.short	0x6c
-	.short	0xb23
-	.short	0x366
-	.short	0x356
-	.short	0x5e6
-	.short	0x9e7
-	.short	0x4fe
-	.short	0x5fa
-	.short	0x4a1
-	.short	0x67b
-	.short	0x4a3
-	.short	0xc25
-	.short	0x36a
-	.short	0x537
-	.short	0x83f
-	.short	0x88
-	.short	0x4bf
-	.short	0xb81
-	.short	0x5b9
-	.short	0x505
-	.short	0x7d7
-	.short	0xa9f
-	.short	0xaa6
-	.short	0x8b8
-	.short	0x9d0
-	.short	0x4b
-	.short	0x9c
-	.short	0xbb8
-	.short	0xb5f
-	.short	0xba4
-	.short	0x368
-	.short	0xa7d
-	.short	0x636
-	.short	0x8a2
-	.short	0x25a
-	.short	0x736
-	.short	0x309
-	.short	0x93
-	.short	0x87a
-	.short	0x9f7
-	.short	0xf6
-	.short	0x68c
-	.short	0x6db
-	.short	0x1cc
-	.short	0x123
-	.short	0xeb
-	.short	0xc50
-	.short	0xab6
-	.short	0xb5b
-	.short	0xc98
-	.short	0x6f3
-	.short	0x99a
-	.short	0x4e3
-	.short	0x9b6
-	.short	0xad6
-	.short	0xb53
-	.short	0x44f
-	.short	0x4fb
-	.short	0xa5c
-	.short	0x429
-	.short	0xb41
-	.short	0x2d5
-	.short	0x5e4
-	.short	0x940
-	.short	0x18e
-	.short	0x3b7
-	.short	0xf7
-	.short	0x58d
-	.short	0xc96
-	.short	0x9c3
-	.short	0x10f
-	.short	0x5a
-	.short	0x355
-	.short	0x744
-	.short	0xc83
-	.short	0x48a
-	.short	0x652
-	.short	0x29a
-	.short	0x140
-	.short	0x8
-	.short	0xafd
-	.short	0x608
-	.short	0x11a
-	.short	0x72e
-	.short	0x50d
-	.short	0x90a
-	.short	0x228
-	.short	0xa75
-	.short	0x83a
-	.short	0x623
-	.short	0xcd
-	.short	0xb66
-	.short	0x606
-	.short	0xaa1
-	.short	0xa25
-	.short	0x908
-	.short	0x2a9
-	.short	0x82
-	.short	0x642
-	.short	0x74f
-	.short	0x33d
-	.short	0xb82
-	.short	0xbf9
-	.short	0x52d
-	.short	0xac4
-	.short	0x745
-	.short	0x5c2
-	.short	0x4b2
-	.short	0x93f
-	.short	0xc4b
-	.short	0x6d8
-	.short	0xa93
-	.short	0xab
-	.short	0xc37
-	.short	0xbe2
-	.short	0x773
-	.short	0x72c
-	.short	0x5ed
-	.short	0x167
-	.short	0x2f6
-	.short	0x5a1
+	.short	0x06a5,0x070f,0x05b4,0x0943,0x0922,0x091d,0x0134,0x006c
+	.short	0x0b23,0x0366,0x0356,0x05e6,0x09e7,0x04fe,0x05fa,0x04a1
+	.short	0x067b,0x04a3,0x0c25,0x036a,0x0537,0x083f,0x0088,0x04bf
+	.short	0x0b81,0x05b9,0x0505,0x07d7,0x0a9f,0x0aa6,0x08b8,0x09d0
+	.short	0x004b,0x009c,0x0bb8,0x0b5f,0x0ba4,0x0368,0x0a7d,0x0636
+	.short	0x08a2,0x025a,0x0736,0x0309,0x0093,0x087a,0x09f7,0x00f6
+	.short	0x068c,0x06db,0x01cc,0x0123,0x00eb,0x0c50,0x0ab6,0x0b5b
+	.short	0x0c98,0x06f3,0x099a,0x04e3,0x09b6,0x0ad6,0x0b53,0x044f
+	.short	0x04fb,0x0a5c,0x0429,0x0b41,0x02d5,0x05e4,0x0940,0x018e
+	.short	0x03b7,0x00f7,0x058d,0x0c96,0x09c3,0x010f,0x005a,0x0355
+	.short	0x0744,0x0c83,0x048a,0x0652,0x029a,0x0140,0x0008,0x0afd
+	.short	0x0608,0x011a,0x072e,0x050d,0x090a,0x0228,0x0a75,0x083a
+	.short	0x0623,0x00cd,0x0b66,0x0606,0x0aa1,0x0a25,0x0908,0x02a9
+	.short	0x0082,0x0642,0x074f,0x033d,0x0b82,0x0bf9,0x052d,0x0ac4
+	.short	0x0745,0x05c2,0x04b2,0x093f,0x0c4b,0x06d8,0x0a93,0x00ab
+	.short	0x0c37,0x0be2,0x0773,0x072c,0x05ed,0x0167,0x02f6,0x05a1
 	.text
 	.align	4
 	.globl	mlkem_arm32_invntt
@@ -7673,139 +7467,36 @@ L_mlkem_invntt_loop_321:
 	add	sp, sp, #8
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	mlkem_arm32_invntt,.-mlkem_arm32_invntt
+#ifndef __APPLE__
 	.text
 	.type	L_mlkem_basemul_mont_zetas, %object
 	.size	L_mlkem_basemul_mont_zetas, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 4-byte aligned, 32-bit aligned
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_mlkem_basemul_mont_zetas:
-	.short	0x8ed
-	.short	0xa0b
-	.short	0xb9a
-	.short	0x714
-	.short	0x5d5
-	.short	0x58e
-	.short	0x11f
-	.short	0xca
-	.short	0xc56
-	.short	0x26e
-	.short	0x629
-	.short	0xb6
-	.short	0x3c2
-	.short	0x84f
-	.short	0x73f
-	.short	0x5bc
-	.short	0x23d
-	.short	0x7d4
-	.short	0x108
-	.short	0x17f
-	.short	0x9c4
-	.short	0x5b2
-	.short	0x6bf
-	.short	0xc7f
-	.short	0xa58
-	.short	0x3f9
-	.short	0x2dc
-	.short	0x260
-	.short	0x6fb
-	.short	0x19b
-	.short	0xc34
-	.short	0x6de
-	.short	0x4c7
-	.short	0x28c
-	.short	0xad9
-	.short	0x3f7
-	.short	0x7f4
-	.short	0x5d3
-	.short	0xbe7
-	.short	0x6f9
-	.short	0x204
-	.short	0xcf9
-	.short	0xbc1
-	.short	0xa67
-	.short	0x6af
-	.short	0x877
-	.short	0x7e
-	.short	0x5bd
-	.short	0x9ac
-	.short	0xca7
-	.short	0xbf2
-	.short	0x33e
-	.short	0x6b
-	.short	0x774
-	.short	0xc0a
-	.short	0x94a
-	.short	0xb73
-	.short	0x3c1
-	.short	0x71d
-	.short	0xa2c
-	.short	0x1c0
-	.short	0x8d8
-	.short	0x2a5
-	.short	0x806
-	.short	0x8b2
-	.short	0x1ae
-	.short	0x22b
-	.short	0x34b
-	.short	0x81e
-	.short	0x367
-	.short	0x60e
-	.short	0x69
-	.short	0x1a6
-	.short	0x24b
-	.short	0xb1
-	.short	0xc16
-	.short	0xbde
-	.short	0xb35
-	.short	0x626
-	.short	0x675
-	.short	0xc0b
-	.short	0x30a
-	.short	0x487
-	.short	0xc6e
-	.short	0x9f8
-	.short	0x5cb
-	.short	0xaa7
-	.short	0x45f
-	.short	0x6cb
-	.short	0x284
-	.short	0x999
-	.short	0x15d
-	.short	0x1a2
-	.short	0x149
-	.short	0xc65
-	.short	0xcb6
-	.short	0x331
-	.short	0x449
-	.short	0x25b
-	.short	0x262
-	.short	0x52a
-	.short	0x7fc
-	.short	0x748
-	.short	0x180
-	.short	0x842
-	.short	0xc79
-	.short	0x4c2
-	.short	0x7ca
-	.short	0x997
-	.short	0xdc
-	.short	0x85e
-	.short	0x686
-	.short	0x860
-	.short	0x707
-	.short	0x803
-	.short	0x31a
-	.short	0x71b
-	.short	0x9ab
-	.short	0x99b
-	.short	0x1de
-	.short	0xc95
-	.short	0xbcd
-	.short	0x3e4
-	.short	0x3df
-	.short	0x3be
-	.short	0x74d
-	.short	0x5f2
-	.short	0x65c
+	.short	0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
+	.short	0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc
+	.short	0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f
+	.short	0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de
+	.short	0x04c7,0x028c,0x0ad9,0x03f7,0x07f4,0x05d3,0x0be7,0x06f9
+	.short	0x0204,0x0cf9,0x0bc1,0x0a67,0x06af,0x0877,0x007e,0x05bd
+	.short	0x09ac,0x0ca7,0x0bf2,0x033e,0x006b,0x0774,0x0c0a,0x094a
+	.short	0x0b73,0x03c1,0x071d,0x0a2c,0x01c0,0x08d8,0x02a5,0x0806
+	.short	0x08b2,0x01ae,0x022b,0x034b,0x081e,0x0367,0x060e,0x0069
+	.short	0x01a6,0x024b,0x00b1,0x0c16,0x0bde,0x0b35,0x0626,0x0675
+	.short	0x0c0b,0x030a,0x0487,0x0c6e,0x09f8,0x05cb,0x0aa7,0x045f
+	.short	0x06cb,0x0284,0x0999,0x015d,0x01a2,0x0149,0x0c65,0x0cb6
+	.short	0x0331,0x0449,0x025b,0x0262,0x052a,0x07fc,0x0748,0x0180
+	.short	0x0842,0x0c79,0x04c2,0x07ca,0x0997,0x00dc,0x085e,0x0686
+	.short	0x0860,0x0707,0x0803,0x031a,0x071b,0x09ab,0x099b,0x01de
+	.short	0x0c95,0x0bcd,0x03e4,0x03df,0x03be,0x074d,0x05f2,0x065c
 	.text
 	.align	4
 	.globl	mlkem_arm32_basemul_mont
diff --git a/wolfcrypt/src/port/arm/armv8-32-mlkem-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-mlkem-asm_c.c
index 721fc2b3ba2..686fd9f8efd 100644
--- a/wolfcrypt/src/port/arm/armv8-32-mlkem-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-mlkem-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -52,50 +50,34 @@
 #include <wolfssl/wolfcrypt/wc_mlkem.h>
 
 #ifdef WOLFSSL_WC_MLKEM
-static const word16 L_mlkem_arm32_ntt_zetas[] = {
-    0x08ed, 0x0a0b, 0x0b9a, 0x0714,
-    0x05d5, 0x058e, 0x011f, 0x00ca,
-    0x0c56, 0x026e, 0x0629, 0x00b6,
-    0x03c2, 0x084f, 0x073f, 0x05bc,
-    0x023d, 0x07d4, 0x0108, 0x017f,
-    0x09c4, 0x05b2, 0x06bf, 0x0c7f,
-    0x0a58, 0x03f9, 0x02dc, 0x0260,
-    0x06fb, 0x019b, 0x0c34, 0x06de,
-    0x04c7, 0x028c, 0x0ad9, 0x03f7,
-    0x07f4, 0x05d3, 0x0be7, 0x06f9,
-    0x0204, 0x0cf9, 0x0bc1, 0x0a67,
-    0x06af, 0x0877, 0x007e, 0x05bd,
-    0x09ac, 0x0ca7, 0x0bf2, 0x033e,
-    0x006b, 0x0774, 0x0c0a, 0x094a,
-    0x0b73, 0x03c1, 0x071d, 0x0a2c,
-    0x01c0, 0x08d8, 0x02a5, 0x0806,
-    0x08b2, 0x01ae, 0x022b, 0x034b,
-    0x081e, 0x0367, 0x060e, 0x0069,
-    0x01a6, 0x024b, 0x00b1, 0x0c16,
-    0x0bde, 0x0b35, 0x0626, 0x0675,
-    0x0c0b, 0x030a, 0x0487, 0x0c6e,
-    0x09f8, 0x05cb, 0x0aa7, 0x045f,
-    0x06cb, 0x0284, 0x0999, 0x015d,
-    0x01a2, 0x0149, 0x0c65, 0x0cb6,
-    0x0331, 0x0449, 0x025b, 0x0262,
-    0x052a, 0x07fc, 0x0748, 0x0180,
-    0x0842, 0x0c79, 0x04c2, 0x07ca,
-    0x0997, 0x00dc, 0x085e, 0x0686,
-    0x0860, 0x0707, 0x0803, 0x031a,
-    0x071b, 0x09ab, 0x099b, 0x01de,
-    0x0c95, 0x0bcd, 0x03e4, 0x03df,
-    0x03be, 0x074d, 0x05f2, 0x065c,
+XALIGNED(4) static const word16 L_mlkem_arm32_ntt_zetas[] = {
+    0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca,
+    0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc,
+    0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f,
+    0x0a58, 0x03f9, 0x02dc, 0x0260, 0x06fb, 0x019b, 0x0c34, 0x06de,
+    0x04c7, 0x028c, 0x0ad9, 0x03f7, 0x07f4, 0x05d3, 0x0be7, 0x06f9,
+    0x0204, 0x0cf9, 0x0bc1, 0x0a67, 0x06af, 0x0877, 0x007e, 0x05bd,
+    0x09ac, 0x0ca7, 0x0bf2, 0x033e, 0x006b, 0x0774, 0x0c0a, 0x094a,
+    0x0b73, 0x03c1, 0x071d, 0x0a2c, 0x01c0, 0x08d8, 0x02a5, 0x0806,
+    0x08b2, 0x01ae, 0x022b, 0x034b, 0x081e, 0x0367, 0x060e, 0x0069,
+    0x01a6, 0x024b, 0x00b1, 0x0c16, 0x0bde, 0x0b35, 0x0626, 0x0675,
+    0x0c0b, 0x030a, 0x0487, 0x0c6e, 0x09f8, 0x05cb, 0x0aa7, 0x045f,
+    0x06cb, 0x0284, 0x0999, 0x015d, 0x01a2, 0x0149, 0x0c65, 0x0cb6,
+    0x0331, 0x0449, 0x025b, 0x0262, 0x052a, 0x07fc, 0x0748, 0x0180,
+    0x0842, 0x0c79, 0x04c2, 0x07ca, 0x0997, 0x00dc, 0x085e, 0x0686,
+    0x0860, 0x0707, 0x0803, 0x031a, 0x071b, 0x09ab, 0x099b, 0x01de,
+    0x0c95, 0x0bcd, 0x03e4, 0x03df, 0x03be, 0x074d, 0x05f2, 0x065c,
 };
 
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void mlkem_arm32_ntt(sword16* r_p)
 #else
 WC_OMIT_FRAME_POINTER void mlkem_arm32_ntt(sword16* r)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword16* r asm ("r0") = (sword16*)r_p;
-    register word16* L_mlkem_arm32_ntt_zetas_c asm ("r1") =
+    register sword16* r __asm__ ("r0") = (sword16*)r_p;
+    register word16* L_mlkem_arm32_ntt_zetas_c __asm__ ("r1") =
         (word16*)&L_mlkem_arm32_ntt_zetas;
 #else
     register word16* L_mlkem_arm32_ntt_zetas_c =
@@ -121,7 +103,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_ntt(sword16* r)
 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH >= 6 */
         "mov	r2, #16\n\t"
         "\n"
-    "L_mlkem_arm32_ntt_loop_123_%=: \n\t"
+    "L_mlkem_arm32_ntt_loop_123_%=:\n\t"
         "str	r2, [sp]\n\t"
         "ldrh	r11, [r1, #2]\n\t"
         "ldr	r2, [%[r]]\n\t"
@@ -1238,13 +1220,13 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_ntt(sword16* r)
         "sub	%[r], %[r], #0x40\n\t"
         "mov	r3, #0\n\t"
         "\n"
-    "L_mlkem_arm32_ntt_loop_4_j_%=: \n\t"
+    "L_mlkem_arm32_ntt_loop_4_j_%=:\n\t"
         "str	r3, [sp, #4]\n\t"
         "add	r11, r1, r3, lsr #4\n\t"
         "mov	r2, #4\n\t"
         "ldr	r11, [r11, #16]\n\t"
         "\n"
-    "L_mlkem_arm32_ntt_loop_4_i_%=: \n\t"
+    "L_mlkem_arm32_ntt_loop_4_i_%=:\n\t"
         "str	r2, [sp]\n\t"
         "ldr	r2, [%[r]]\n\t"
         "ldr	r3, [%[r], #16]\n\t"
@@ -1639,7 +1621,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_ntt(sword16* r)
         "sub	%[r], %[r], #0x200\n\t"
         "mov	r3, #0\n\t"
         "\n"
-    "L_mlkem_arm32_ntt_loop_567_%=: \n\t"
+    "L_mlkem_arm32_ntt_loop_567_%=:\n\t"
         "add	r11, r1, r3, lsr #3\n\t"
         "str	r3, [sp, #4]\n\t"
         "ldrh	r11, [r11, #32]\n\t"
@@ -3104,50 +3086,34 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_ntt(sword16* r)
     );
 }
 
-static const word16 L_mlkem_invntt_zetas_inv[] = {
-    0x06a5, 0x070f, 0x05b4, 0x0943,
-    0x0922, 0x091d, 0x0134, 0x006c,
-    0x0b23, 0x0366, 0x0356, 0x05e6,
-    0x09e7, 0x04fe, 0x05fa, 0x04a1,
-    0x067b, 0x04a3, 0x0c25, 0x036a,
-    0x0537, 0x083f, 0x0088, 0x04bf,
-    0x0b81, 0x05b9, 0x0505, 0x07d7,
-    0x0a9f, 0x0aa6, 0x08b8, 0x09d0,
-    0x004b, 0x009c, 0x0bb8, 0x0b5f,
-    0x0ba4, 0x0368, 0x0a7d, 0x0636,
-    0x08a2, 0x025a, 0x0736, 0x0309,
-    0x0093, 0x087a, 0x09f7, 0x00f6,
-    0x068c, 0x06db, 0x01cc, 0x0123,
-    0x00eb, 0x0c50, 0x0ab6, 0x0b5b,
-    0x0c98, 0x06f3, 0x099a, 0x04e3,
-    0x09b6, 0x0ad6, 0x0b53, 0x044f,
-    0x04fb, 0x0a5c, 0x0429, 0x0b41,
-    0x02d5, 0x05e4, 0x0940, 0x018e,
-    0x03b7, 0x00f7, 0x058d, 0x0c96,
-    0x09c3, 0x010f, 0x005a, 0x0355,
-    0x0744, 0x0c83, 0x048a, 0x0652,
-    0x029a, 0x0140, 0x0008, 0x0afd,
-    0x0608, 0x011a, 0x072e, 0x050d,
-    0x090a, 0x0228, 0x0a75, 0x083a,
-    0x0623, 0x00cd, 0x0b66, 0x0606,
-    0x0aa1, 0x0a25, 0x0908, 0x02a9,
-    0x0082, 0x0642, 0x074f, 0x033d,
-    0x0b82, 0x0bf9, 0x052d, 0x0ac4,
-    0x0745, 0x05c2, 0x04b2, 0x093f,
-    0x0c4b, 0x06d8, 0x0a93, 0x00ab,
-    0x0c37, 0x0be2, 0x0773, 0x072c,
-    0x05ed, 0x0167, 0x02f6, 0x05a1,
+XALIGNED(4) static const word16 L_mlkem_invntt_zetas_inv[] = {
+    0x06a5, 0x070f, 0x05b4, 0x0943, 0x0922, 0x091d, 0x0134, 0x006c,
+    0x0b23, 0x0366, 0x0356, 0x05e6, 0x09e7, 0x04fe, 0x05fa, 0x04a1,
+    0x067b, 0x04a3, 0x0c25, 0x036a, 0x0537, 0x083f, 0x0088, 0x04bf,
+    0x0b81, 0x05b9, 0x0505, 0x07d7, 0x0a9f, 0x0aa6, 0x08b8, 0x09d0,
+    0x004b, 0x009c, 0x0bb8, 0x0b5f, 0x0ba4, 0x0368, 0x0a7d, 0x0636,
+    0x08a2, 0x025a, 0x0736, 0x0309, 0x0093, 0x087a, 0x09f7, 0x00f6,
+    0x068c, 0x06db, 0x01cc, 0x0123, 0x00eb, 0x0c50, 0x0ab6, 0x0b5b,
+    0x0c98, 0x06f3, 0x099a, 0x04e3, 0x09b6, 0x0ad6, 0x0b53, 0x044f,
+    0x04fb, 0x0a5c, 0x0429, 0x0b41, 0x02d5, 0x05e4, 0x0940, 0x018e,
+    0x03b7, 0x00f7, 0x058d, 0x0c96, 0x09c3, 0x010f, 0x005a, 0x0355,
+    0x0744, 0x0c83, 0x048a, 0x0652, 0x029a, 0x0140, 0x0008, 0x0afd,
+    0x0608, 0x011a, 0x072e, 0x050d, 0x090a, 0x0228, 0x0a75, 0x083a,
+    0x0623, 0x00cd, 0x0b66, 0x0606, 0x0aa1, 0x0a25, 0x0908, 0x02a9,
+    0x0082, 0x0642, 0x074f, 0x033d, 0x0b82, 0x0bf9, 0x052d, 0x0ac4,
+    0x0745, 0x05c2, 0x04b2, 0x093f, 0x0c4b, 0x06d8, 0x0a93, 0x00ab,
+    0x0c37, 0x0be2, 0x0773, 0x072c, 0x05ed, 0x0167, 0x02f6, 0x05a1,
 };
 
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void mlkem_arm32_invntt(sword16* r_p)
 #else
 WC_OMIT_FRAME_POINTER void mlkem_arm32_invntt(sword16* r)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword16* r asm ("r0") = (sword16*)r_p;
-    register word16* L_mlkem_invntt_zetas_inv_c asm ("r1") =
+    register sword16* r __asm__ ("r0") = (sword16*)r_p;
+    register word16* L_mlkem_invntt_zetas_inv_c __asm__ ("r1") =
         (word16*)&L_mlkem_invntt_zetas_inv;
 #else
     register word16* L_mlkem_invntt_zetas_inv_c =
@@ -3173,7 +3139,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_invntt(sword16* r)
 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH >= 6 */
         "mov	r3, #0\n\t"
         "\n"
-    "L_mlkem_invntt_loop_765_%=: \n\t"
+    "L_mlkem_invntt_loop_765_%=:\n\t"
         "add	r11, r1, r3, lsr #1\n\t"
         "str	r3, [sp, #4]\n\t"
         "ldr	r2, [%[r]]\n\t"
@@ -4774,13 +4740,13 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_invntt(sword16* r)
         "sub	%[r], %[r], #0x200\n\t"
         "mov	r3, #0\n\t"
         "\n"
-    "L_mlkem_invntt_loop_4_j_%=: \n\t"
+    "L_mlkem_invntt_loop_4_j_%=:\n\t"
         "str	r3, [sp, #4]\n\t"
         "add	r11, r1, r3, lsr #4\n\t"
         "mov	r2, #4\n\t"
         "ldr	r11, [r11, #224]\n\t"
         "\n"
-    "L_mlkem_invntt_loop_4_i_%=: \n\t"
+    "L_mlkem_invntt_loop_4_i_%=:\n\t"
         "str	r2, [sp]\n\t"
         "ldr	r2, [%[r]]\n\t"
         "ldr	r3, [%[r], #16]\n\t"
@@ -5279,7 +5245,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_invntt(sword16* r)
         "sub	%[r], %[r], #0x200\n\t"
         "mov	r2, #16\n\t"
         "\n"
-    "L_mlkem_invntt_loop_321_%=: \n\t"
+    "L_mlkem_invntt_loop_321_%=:\n\t"
         "str	r2, [sp]\n\t"
         "ldrh	r11, [r1, #2]\n\t"
         "ldr	r2, [%[r]]\n\t"
@@ -7550,39 +7516,23 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_invntt(sword16* r)
     );
 }
 
-static const word16 L_mlkem_basemul_mont_zetas[] = {
-    0x08ed, 0x0a0b, 0x0b9a, 0x0714,
-    0x05d5, 0x058e, 0x011f, 0x00ca,
-    0x0c56, 0x026e, 0x0629, 0x00b6,
-    0x03c2, 0x084f, 0x073f, 0x05bc,
-    0x023d, 0x07d4, 0x0108, 0x017f,
-    0x09c4, 0x05b2, 0x06bf, 0x0c7f,
-    0x0a58, 0x03f9, 0x02dc, 0x0260,
-    0x06fb, 0x019b, 0x0c34, 0x06de,
-    0x04c7, 0x028c, 0x0ad9, 0x03f7,
-    0x07f4, 0x05d3, 0x0be7, 0x06f9,
-    0x0204, 0x0cf9, 0x0bc1, 0x0a67,
-    0x06af, 0x0877, 0x007e, 0x05bd,
-    0x09ac, 0x0ca7, 0x0bf2, 0x033e,
-    0x006b, 0x0774, 0x0c0a, 0x094a,
-    0x0b73, 0x03c1, 0x071d, 0x0a2c,
-    0x01c0, 0x08d8, 0x02a5, 0x0806,
-    0x08b2, 0x01ae, 0x022b, 0x034b,
-    0x081e, 0x0367, 0x060e, 0x0069,
-    0x01a6, 0x024b, 0x00b1, 0x0c16,
-    0x0bde, 0x0b35, 0x0626, 0x0675,
-    0x0c0b, 0x030a, 0x0487, 0x0c6e,
-    0x09f8, 0x05cb, 0x0aa7, 0x045f,
-    0x06cb, 0x0284, 0x0999, 0x015d,
-    0x01a2, 0x0149, 0x0c65, 0x0cb6,
-    0x0331, 0x0449, 0x025b, 0x0262,
-    0x052a, 0x07fc, 0x0748, 0x0180,
-    0x0842, 0x0c79, 0x04c2, 0x07ca,
-    0x0997, 0x00dc, 0x085e, 0x0686,
-    0x0860, 0x0707, 0x0803, 0x031a,
-    0x071b, 0x09ab, 0x099b, 0x01de,
-    0x0c95, 0x0bcd, 0x03e4, 0x03df,
-    0x03be, 0x074d, 0x05f2, 0x065c,
+XALIGNED(4) static const word16 L_mlkem_basemul_mont_zetas[] = {
+    0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca,
+    0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc,
+    0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f,
+    0x0a58, 0x03f9, 0x02dc, 0x0260, 0x06fb, 0x019b, 0x0c34, 0x06de,
+    0x04c7, 0x028c, 0x0ad9, 0x03f7, 0x07f4, 0x05d3, 0x0be7, 0x06f9,
+    0x0204, 0x0cf9, 0x0bc1, 0x0a67, 0x06af, 0x0877, 0x007e, 0x05bd,
+    0x09ac, 0x0ca7, 0x0bf2, 0x033e, 0x006b, 0x0774, 0x0c0a, 0x094a,
+    0x0b73, 0x03c1, 0x071d, 0x0a2c, 0x01c0, 0x08d8, 0x02a5, 0x0806,
+    0x08b2, 0x01ae, 0x022b, 0x034b, 0x081e, 0x0367, 0x060e, 0x0069,
+    0x01a6, 0x024b, 0x00b1, 0x0c16, 0x0bde, 0x0b35, 0x0626, 0x0675,
+    0x0c0b, 0x030a, 0x0487, 0x0c6e, 0x09f8, 0x05cb, 0x0aa7, 0x045f,
+    0x06cb, 0x0284, 0x0999, 0x015d, 0x01a2, 0x0149, 0x0c65, 0x0cb6,
+    0x0331, 0x0449, 0x025b, 0x0262, 0x052a, 0x07fc, 0x0748, 0x0180,
+    0x0842, 0x0c79, 0x04c2, 0x07ca, 0x0997, 0x00dc, 0x085e, 0x0686,
+    0x0860, 0x0707, 0x0803, 0x031a, 0x071b, 0x09ab, 0x099b, 0x01de,
+    0x0c95, 0x0bcd, 0x03e4, 0x03df, 0x03be, 0x074d, 0x05f2, 0x065c,
 };
 
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
@@ -7591,13 +7541,13 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont(sword16* r_p,
 #else
 WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont(sword16* r,
     const sword16* a, const sword16* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword16* r asm ("r0") = (sword16*)r_p;
-    register const sword16* a asm ("r1") = (const sword16*)a_p;
-    register const sword16* b asm ("r2") = (const sword16*)b_p;
-    register word16* L_mlkem_basemul_mont_zetas_c asm ("r3") =
+    register sword16* r __asm__ ("r0") = (sword16*)r_p;
+    register const sword16* a __asm__ ("r1") = (const sword16*)a_p;
+    register const sword16* b __asm__ ("r2") = (const sword16*)b_p;
+    register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r3") =
         (word16*)&L_mlkem_basemul_mont_zetas;
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
@@ -7623,7 +7573,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont(sword16* r,
 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH >= 6 */
         "mov	r8, #0\n\t"
         "\n"
-    "L_mlkem_basemul_mont_loop_%=: \n\t"
+    "L_mlkem_basemul_mont_loop_%=:\n\t"
         "ldm	%[a]!, {r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7}\n\t"
         "ldr	lr, [r3, r8]\n\t"
@@ -7896,13 +7846,13 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont_add(sword16* r_p,
 #else
 WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont_add(sword16* r,
     const sword16* a, const sword16* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword16* r asm ("r0") = (sword16*)r_p;
-    register const sword16* a asm ("r1") = (const sword16*)a_p;
-    register const sword16* b asm ("r2") = (const sword16*)b_p;
-    register word16* L_mlkem_basemul_mont_zetas_c asm ("r3") =
+    register sword16* r __asm__ ("r0") = (sword16*)r_p;
+    register const sword16* a __asm__ ("r1") = (const sword16*)a_p;
+    register const sword16* b __asm__ ("r2") = (const sword16*)b_p;
+    register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r3") =
         (word16*)&L_mlkem_basemul_mont_zetas;
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
@@ -7928,7 +7878,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont_add(sword16* r,
 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH >= 6 */
         "mov	r8, #0\n\t"
         "\n"
-    "L_mlkem_arm32_basemul_mont_add_loop_%=: \n\t"
+    "L_mlkem_arm32_basemul_mont_add_loop_%=:\n\t"
         "ldm	%[a]!, {r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7}\n\t"
         "ldr	lr, [r3, r8]\n\t"
@@ -8233,11 +8183,11 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_basemul_mont_add(sword16* r,
 WC_OMIT_FRAME_POINTER void mlkem_arm32_csubq(sword16* p_p)
 #else
 WC_OMIT_FRAME_POINTER void mlkem_arm32_csubq(sword16* p)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword16* p asm ("r0") = (sword16*)p_p;
-    register word16* L_mlkem_basemul_mont_zetas_c asm ("r1") =
+    register sword16* p __asm__ ("r0") = (sword16*)p_p;
+    register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r1") =
         (word16*)&L_mlkem_basemul_mont_zetas;
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
@@ -8273,7 +8223,7 @@ WC_OMIT_FRAME_POINTER void mlkem_arm32_csubq(sword16* p)
 #endif
         "mov	r1, #0x100\n\t"
         "\n"
-    "L_mlkem_arm32_csubq_loop_%=: \n\t"
+    "L_mlkem_arm32_csubq_loop_%=:\n\t"
         "ldm	%[p], {r2, r3, r4, r5}\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH >= 6)
         "ssub16	r2, r2, lr\n\t"
@@ -8429,14 +8379,14 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p_p,
 #else
 WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
     unsigned int len, const byte* r, unsigned int rLen)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sword16* p asm ("r0") = (sword16*)p_p;
-    register unsigned int len asm ("r1") = (unsigned int)len_p;
-    register const byte* r asm ("r2") = (const byte*)r_p;
-    register unsigned int rLen asm ("r3") = (unsigned int)rLen_p;
-    register word16* L_mlkem_basemul_mont_zetas_c asm ("r12") =
+    register sword16* p __asm__ ("r0") = (sword16*)p_p;
+    register unsigned int len __asm__ ("r1") = (unsigned int)len_p;
+    register const byte* r __asm__ ("r2") = (const byte*)r_p;
+    register unsigned int rLen __asm__ ("r3") = (unsigned int)rLen_p;
+    register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r12") =
         (word16*)&L_mlkem_basemul_mont_zetas;
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
@@ -8453,7 +8403,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
 #endif
         "mov	r12, #0\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_loop_no_fail_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_loop_no_fail_%=:\n\t"
         "cmp	%[len], #8\n\t"
         "blt	L_mlkem_arm32_rej_uniform_done_no_fail_%=\n\t"
         "ldm	%[r]!, {r4, r5, r6}\n\t"
@@ -8562,11 +8512,11 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "bne	L_mlkem_arm32_rej_uniform_loop_no_fail_%=\n\t"
         "b	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_done_no_fail_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_done_no_fail_%=:\n\t"
         "cmp	%[len], #0\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_loop_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_loop_%=:\n\t"
         "ldm	%[r]!, {r4, r5, r6}\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsl	r7, r4, #20\n\t"
@@ -8581,7 +8531,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_0_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_0_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsl	r7, r4, #8\n\t"
         "lsr	r7, r7, #20\n\t"
@@ -8595,7 +8545,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_1_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_1_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsr	r7, r4, #24\n\t"
 #else
@@ -8616,7 +8566,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_2_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_2_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsl	r7, r5, #16\n\t"
         "lsr	r7, r7, #20\n\t"
@@ -8630,7 +8580,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_3_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_3_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsl	r7, r5, #4\n\t"
         "lsr	r7, r7, #20\n\t"
@@ -8644,7 +8594,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_4_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_4_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsr	r7, r5, #28\n\t"
 #else
@@ -8665,7 +8615,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_5_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_5_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsl	r7, r6, #12\n\t"
         "lsr	r7, r7, #20\n\t"
@@ -8679,7 +8629,7 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_6_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_6_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "lsr	r7, r6, #20\n\t"
 #else
@@ -8692,11 +8642,11 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_arm32_rej_uniform(sword16* p,
         "add	r12, r12, #2\n\t"
         "beq	L_mlkem_arm32_rej_uniform_done_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_fail_7_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_fail_7_%=:\n\t"
         "subs	%[rLen], %[rLen], #12\n\t"
         "bgt	L_mlkem_arm32_rej_uniform_loop_%=\n\t"
         "\n"
-    "L_mlkem_arm32_rej_uniform_done_%=: \n\t"
+    "L_mlkem_arm32_rej_uniform_done_%=:\n\t"
         "lsr	r0, r12, #1\n\t"
         "pop	{%[L_mlkem_basemul_mont_zetas]}\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
index bcc3305e137..f67031475bd 100644
--- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
@@ -246,15 +246,21 @@ L_poly1305_arm32_16_done:
 	add	sp, sp, #28
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16
+#ifndef __APPLE__
 	.text
 	.type	L_poly1305_arm32_clamp, %object
 	.size	L_poly1305_arm32_clamp, 16
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# Align the table to an 8-byte (64-bit) boundary
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_poly1305_arm32_clamp:
-	.word	0xfffffff
-	.word	0xffffffc
-	.word	0xffffffc
-	.word	0xffffffc
+	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc
 	.text
 	.align	4
 	.globl	poly1305_set_key
@@ -822,7 +828,7 @@ L_poly1305_arm32_blocks_start_1:
 	adcs	r9, r9, r4
 	adcs	r10, r10, r5
 	adc	r11, r11, r12
-	push	{r0, r1}
+	push	{r0-r1}
 	add	r1, r0, #0
 	add	lr, r0, #16
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
@@ -996,7 +1002,7 @@ L_poly1305_arm32_blocks_start_1:
 	adcs	r9, r9, r5
 	adcs	r10, r10, r6
 	adc	r11, r11, r12
-	pop	{r0, r1}
+	pop	{r0-r1}
 	pop	{r2}
 	add	r12, r0, #16
 	stm	r12, {r7, r8, r9, r10, r11}
@@ -1004,15 +1010,21 @@ L_poly1305_arm32_blocks_done:
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	.size	poly1305_arm32_blocks,.-poly1305_arm32_blocks
+#ifndef __APPLE__
 	.text
 	.type	L_poly1305_arm32_clamp, %object
 	.size	L_poly1305_arm32_clamp, 16
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# Align the table to an 8-byte (64-bit) boundary
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_poly1305_arm32_clamp:
-	.word	0xfffffff
-	.word	0xffffffc
-	.word	0xffffffc
-	.word	0xffffffc
+	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc
 	.text
 	.align	4
 	.globl	poly1305_set_key
@@ -1059,7 +1071,7 @@ poly1305_set_key:
 	vmov.i32	s5, r9
 	vmov.i32	s7, r10
 	vmov.i32	s9, r11
-	push	{r0, r1}
+	push	{r0-r1}
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
 	# Square r
 	umull	r1, r6, r2, r3
@@ -1158,7 +1170,7 @@ poly1305_set_key:
 	vmov.i32	s4, r4
 	vmov.i32	s6, r5
 	vmov.i32	s8, r10
-	pop	{r0, r1}
+	pop	{r0-r1}
 	add	lr, r0, #0x7c
 	vstm.32	lr, {d0-d4}
 	# Multiply r^2, r by r^2
diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
index 7a8a1e06111..c71c3acf8c4 100644
--- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -59,13 +57,13 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx_p,
 #else
 WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
     const byte* m, word32 len, int notLast)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register const byte* m asm ("r1") = (const byte*)m_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register int notLast asm ("r3") = (int)notLast_p;
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register const byte* m __asm__ ("r1") = (const byte*)m_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register int notLast __asm__ ("r3") = (int)notLast_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78,7 +76,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
         "add	lr, %[ctx], #16\n\t"
         "ldm	lr, {r4, r5, r6, r7, r8}\n\t"
         "\n"
-    "L_poly1305_arm32_16_loop_%=: \n\t"
+    "L_poly1305_arm32_16_loop_%=:\n\t"
         /* Add m to h */
         "ldr	%[m], [sp, #16]\n\t"
         "ldr	%[len], [%[m]]\n\t"
@@ -275,7 +273,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
         "bgt	L_poly1305_arm32_16_loop_%=\n\t"
         "stm	lr, {r4, r5, r6, r7, r8}\n\t"
         "\n"
-    "L_poly1305_arm32_16_done_%=: \n\t"
+    "L_poly1305_arm32_16_done_%=:\n\t"
         "add	sp, sp, #28\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
@@ -291,7 +289,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
     );
 }
 
-static const word32 L_poly1305_arm32_clamp[] = {
+XALIGNED(8) static const word32 L_poly1305_arm32_clamp[] = {
     0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
 };
 
@@ -299,12 +297,12 @@ static const word32 L_poly1305_arm32_clamp[] = {
 WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
 #else
 WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx, const byte* key)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register const byte* key asm ("r1") = (const byte*)key_p;
-    register word32* L_poly1305_arm32_clamp_c asm ("r2") =
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register const byte* key __asm__ ("r1") = (const byte*)key_p;
+    register word32* L_poly1305_arm32_clamp_c __asm__ ("r2") =
         (word32*)&L_poly1305_arm32_clamp;
 #else
     register word32* L_poly1305_arm32_clamp_c =
@@ -360,11 +358,11 @@ WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx, const byte* key)
 WC_OMIT_FRAME_POINTER void poly1305_final(Poly1305* ctx_p, byte* mac_p)
 #else
 WC_OMIT_FRAME_POINTER void poly1305_final(Poly1305* ctx, byte* mac)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register byte* mac asm ("r1") = (byte*)mac_p;
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register byte* mac __asm__ ("r1") = (byte*)mac_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -430,13 +428,13 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx_p,
 #else
 WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
     const byte* m, word32 len, int notLast)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register const byte* m asm ("r1") = (const byte*)m_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register int notLast asm ("r3") = (int)notLast_p;
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register const byte* m __asm__ ("r1") = (const byte*)m_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register int notLast __asm__ ("r3") = (int)notLast_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -449,7 +447,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
         "add	lr, %[ctx], #16\n\t"
         "ldm	lr, {r4, r5, r6, r7, r8}\n\t"
         "\n"
-    "L_poly1305_arm32_16_loop_%=: \n\t"
+    "L_poly1305_arm32_16_loop_%=:\n\t"
         /* Add m to h */
         "ldr	%[m], [sp, #16]\n\t"
         "ldr	%[len], [%[m]]\n\t"
@@ -646,7 +644,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks_16(Poly1305* ctx,
         "bgt	L_poly1305_arm32_16_loop_%=\n\t"
         "stm	lr, {r4, r5, r6, r7, r8}\n\t"
         "\n"
-    "L_poly1305_arm32_16_done_%=: \n\t"
+    "L_poly1305_arm32_16_done_%=:\n\t"
         "add	sp, sp, #28\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
@@ -668,12 +666,12 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx_p,
 #else
 WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
     const unsigned char* m, size_t bytes)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register const unsigned char* m asm ("r1") = (const unsigned char*)m_p;
-    register size_t bytes asm ("r2") = (size_t)bytes_p;
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register const unsigned char* m __asm__ ("r1") = (const unsigned char*)m_p;
+    register size_t bytes __asm__ ("r2") = (size_t)bytes_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -683,7 +681,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "ldm	r12, {r7, r8, r9, r10, r11}\n\t"
         "b	L_poly1305_arm32_blocks_start_1_%=\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_begin_neon_%=: \n\t"
+    "L_poly1305_arm32_blocks_begin_neon_%=:\n\t"
         "vmov.i16	q15, #0xffff\n\t"
         "vshr.u64	q15, q15, #38\n\t"
         "vld1.64	{d0-d2}, [r12]\n\t"
@@ -711,11 +709,11 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "vadd.u32	d9, d9, d24\n\t"
         "b	L_poly1305_arm32_blocks_start_2_%=\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_begin_4_%=: \n\t"
+    "L_poly1305_arm32_blocks_begin_4_%=:\n\t"
         "add	r3, %[ctx], #0xa4\n\t"
         "vldm.32	r3, {d26-d30}\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_start_4_%=: \n\t"
+    "L_poly1305_arm32_blocks_start_4_%=:\n\t"
         "sub	%[bytes], #0x40\n\t"
         "vld4.32	{d10-d13}, [%[m]]!\n\t"
         "vshl.u32	d6, d27, #2\n\t"
@@ -837,7 +835,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "cmp	%[bytes], #32\n\t"
         "blt	L_poly1305_arm32_blocks_done_neon_%=\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_start_2_%=: \n\t"
+    "L_poly1305_arm32_blocks_start_2_%=:\n\t"
         "sub	%[bytes], #32\n\t"
         "vld4.32	{d10-d13}, [%[m]]!\n\t"
         "vshr.u32	d14, d13, #8\n\t"
@@ -901,7 +899,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "vsra.u64	d1, d0, #26\n\t"
         "vand.u64	d0, d0, d31\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_done_neon_%=: \n\t"
+    "L_poly1305_arm32_blocks_done_neon_%=:\n\t"
         "cmp	%[bytes], #16\n\t"
         "beq	L_poly1305_arm32_blocks_begin_1_%=\n\t"
         "add	r12, %[ctx], #16\n\t"
@@ -914,7 +912,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "vst1.64	{d0-d2}, [r12]\n\t"
         "b	L_poly1305_arm32_blocks_done_%=\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_begin_1_%=: \n\t"
+    "L_poly1305_arm32_blocks_begin_1_%=:\n\t"
         "vsli.u64	d0, d1, #26\n\t"
         "vsli.u64	d0, d2, #52\n\t"
         "vshr.u64	d1, d2, #12\n\t"
@@ -925,7 +923,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "vmov	r9, r10, d1\n\t"
         "vmov	r11, d2[0]\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_start_1_%=: \n\t"
+    "L_poly1305_arm32_blocks_start_1_%=:\n\t"
         "mov	r12, #1\n\t"
         "push	{r2}\n\t"
         /* Load message */
@@ -1118,7 +1116,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
         "add	r12, %[ctx], #16\n\t"
         "stm	r12, {r7, r8, r9, r10, r11}\n\t"
         "\n"
-    "L_poly1305_arm32_blocks_done_%=: \n\t"
+    "L_poly1305_arm32_blocks_done_%=:\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [m] "+r" (m), [bytes] "+r" (bytes)
         :
@@ -1134,7 +1132,7 @@ WC_OMIT_FRAME_POINTER void poly1305_arm32_blocks(Poly1305* ctx,
     );
 }
 
-static const word32 L_poly1305_arm32_clamp[] = {
+XALIGNED(8) static const word32 L_poly1305_arm32_clamp[] = {
     0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
 };
 
@@ -1142,12 +1140,12 @@ static const word32 L_poly1305_arm32_clamp[] = {
 WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
 #else
 WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx, const byte* key)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register const byte* key asm ("r1") = (const byte*)key_p;
-    register word32* L_poly1305_arm32_clamp_c asm ("r2") =
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register const byte* key __asm__ ("r1") = (const byte*)key_p;
+    register word32* L_poly1305_arm32_clamp_c __asm__ ("r2") =
         (word32*)&L_poly1305_arm32_clamp;
 #else
     register word32* L_poly1305_arm32_clamp_c =
@@ -1383,11 +1381,11 @@ WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx, const byte* key)
 WC_OMIT_FRAME_POINTER void poly1305_final(Poly1305* ctx_p, byte* mac_p)
 #else
 WC_OMIT_FRAME_POINTER void poly1305_final(Poly1305* ctx, byte* mac)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
-    register byte* mac asm ("r1") = (byte*)mac_p;
+    register Poly1305* ctx __asm__ ("r0") = (Poly1305*)ctx_p;
+    register byte* mac __asm__ ("r1") = (byte*)mac_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
index 510f6016c28..e0883cc7deb 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
@@ -32,75 +32,36 @@
 #ifndef WOLFSSL_ARMASM_INLINE
 #ifndef NO_SHA256
 #ifdef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_SHA256_transform_len_k, %object
 	.size	L_SHA256_transform_len_k, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# Align the table to an 8-byte (64-bit) boundary
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_SHA256_transform_len_k:
-	.word	0x428a2f98
-	.word	0x71374491
-	.word	0xb5c0fbcf
-	.word	0xe9b5dba5
-	.word	0x3956c25b
-	.word	0x59f111f1
-	.word	0x923f82a4
-	.word	0xab1c5ed5
-	.word	0xd807aa98
-	.word	0x12835b01
-	.word	0x243185be
-	.word	0x550c7dc3
-	.word	0x72be5d74
-	.word	0x80deb1fe
-	.word	0x9bdc06a7
-	.word	0xc19bf174
-	.word	0xe49b69c1
-	.word	0xefbe4786
-	.word	0xfc19dc6
-	.word	0x240ca1cc
-	.word	0x2de92c6f
-	.word	0x4a7484aa
-	.word	0x5cb0a9dc
-	.word	0x76f988da
-	.word	0x983e5152
-	.word	0xa831c66d
-	.word	0xb00327c8
-	.word	0xbf597fc7
-	.word	0xc6e00bf3
-	.word	0xd5a79147
-	.word	0x6ca6351
-	.word	0x14292967
-	.word	0x27b70a85
-	.word	0x2e1b2138
-	.word	0x4d2c6dfc
-	.word	0x53380d13
-	.word	0x650a7354
-	.word	0x766a0abb
-	.word	0x81c2c92e
-	.word	0x92722c85
-	.word	0xa2bfe8a1
-	.word	0xa81a664b
-	.word	0xc24b8b70
-	.word	0xc76c51a3
-	.word	0xd192e819
-	.word	0xd6990624
-	.word	0xf40e3585
-	.word	0x106aa070
-	.word	0x19a4c116
-	.word	0x1e376c08
-	.word	0x2748774c
-	.word	0x34b0bcb5
-	.word	0x391c0cb3
-	.word	0x4ed8aa4a
-	.word	0x5b9cca4f
-	.word	0x682e6ff3
-	.word	0x748f82ee
-	.word	0x78a5636f
-	.word	0x84c87814
-	.word	0x8cc70208
-	.word	0x90befffa
-	.word	0xa4506ceb
-	.word	0xbef9a3f7
-	.word	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.text
 	.align	4
 	.globl	Transform_Sha256_Len_base
@@ -2568,75 +2529,36 @@ L_SHA256_transform_len_blk_end_15:
 	.size	Transform_Sha256_Len_base,.-Transform_Sha256_Len_base
 #else
 #ifdef WOLFSSL_ARMASM_NO_HW_CRYPTO
+#ifndef __APPLE__
 	.text
 	.type	L_SHA256_transform_neon_len_k, %object
 	.size	L_SHA256_transform_neon_len_k, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# Align the table to an 8-byte (64-bit) boundary
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_SHA256_transform_neon_len_k:
-	.word	0x428a2f98
-	.word	0x71374491
-	.word	0xb5c0fbcf
-	.word	0xe9b5dba5
-	.word	0x3956c25b
-	.word	0x59f111f1
-	.word	0x923f82a4
-	.word	0xab1c5ed5
-	.word	0xd807aa98
-	.word	0x12835b01
-	.word	0x243185be
-	.word	0x550c7dc3
-	.word	0x72be5d74
-	.word	0x80deb1fe
-	.word	0x9bdc06a7
-	.word	0xc19bf174
-	.word	0xe49b69c1
-	.word	0xefbe4786
-	.word	0xfc19dc6
-	.word	0x240ca1cc
-	.word	0x2de92c6f
-	.word	0x4a7484aa
-	.word	0x5cb0a9dc
-	.word	0x76f988da
-	.word	0x983e5152
-	.word	0xa831c66d
-	.word	0xb00327c8
-	.word	0xbf597fc7
-	.word	0xc6e00bf3
-	.word	0xd5a79147
-	.word	0x6ca6351
-	.word	0x14292967
-	.word	0x27b70a85
-	.word	0x2e1b2138
-	.word	0x4d2c6dfc
-	.word	0x53380d13
-	.word	0x650a7354
-	.word	0x766a0abb
-	.word	0x81c2c92e
-	.word	0x92722c85
-	.word	0xa2bfe8a1
-	.word	0xa81a664b
-	.word	0xc24b8b70
-	.word	0xc76c51a3
-	.word	0xd192e819
-	.word	0xd6990624
-	.word	0xf40e3585
-	.word	0x106aa070
-	.word	0x19a4c116
-	.word	0x1e376c08
-	.word	0x2748774c
-	.word	0x34b0bcb5
-	.word	0x391c0cb3
-	.word	0x4ed8aa4a
-	.word	0x5b9cca4f
-	.word	0x682e6ff3
-	.word	0x748f82ee
-	.word	0x78a5636f
-	.word	0x84c87814
-	.word	0x8cc70208
-	.word	0x90befffa
-	.word	0xa4506ceb
-	.word	0xbef9a3f7
-	.word	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.text
 	.align	4
 	.fpu	neon
@@ -3667,75 +3589,36 @@ L_SHA256_transform_neon_len_start:
 	pop	{r4, r5, r6, r7, r8, r9, r10, pc}
 	.size	Transform_Sha256_Len_neon,.-Transform_Sha256_Len_neon
 #else
+#ifndef __APPLE__
 	.text
 	.type	L_SHA256_trans_crypto_len_k, %object
 	.size	L_SHA256_trans_crypto_len_k, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# Align the table to an 8-byte (64-bit) boundary
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_SHA256_trans_crypto_len_k:
-	.word	0x428a2f98
-	.word	0x71374491
-	.word	0xb5c0fbcf
-	.word	0xe9b5dba5
-	.word	0x3956c25b
-	.word	0x59f111f1
-	.word	0x923f82a4
-	.word	0xab1c5ed5
-	.word	0xd807aa98
-	.word	0x12835b01
-	.word	0x243185be
-	.word	0x550c7dc3
-	.word	0x72be5d74
-	.word	0x80deb1fe
-	.word	0x9bdc06a7
-	.word	0xc19bf174
-	.word	0xe49b69c1
-	.word	0xefbe4786
-	.word	0xfc19dc6
-	.word	0x240ca1cc
-	.word	0x2de92c6f
-	.word	0x4a7484aa
-	.word	0x5cb0a9dc
-	.word	0x76f988da
-	.word	0x983e5152
-	.word	0xa831c66d
-	.word	0xb00327c8
-	.word	0xbf597fc7
-	.word	0xc6e00bf3
-	.word	0xd5a79147
-	.word	0x6ca6351
-	.word	0x14292967
-	.word	0x27b70a85
-	.word	0x2e1b2138
-	.word	0x4d2c6dfc
-	.word	0x53380d13
-	.word	0x650a7354
-	.word	0x766a0abb
-	.word	0x81c2c92e
-	.word	0x92722c85
-	.word	0xa2bfe8a1
-	.word	0xa81a664b
-	.word	0xc24b8b70
-	.word	0xc76c51a3
-	.word	0xd192e819
-	.word	0xd6990624
-	.word	0xf40e3585
-	.word	0x106aa070
-	.word	0x19a4c116
-	.word	0x1e376c08
-	.word	0x2748774c
-	.word	0x34b0bcb5
-	.word	0x391c0cb3
-	.word	0x4ed8aa4a
-	.word	0x5b9cca4f
-	.word	0x682e6ff3
-	.word	0x748f82ee
-	.word	0x78a5636f
-	.word	0x84c87814
-	.word	0x8cc70208
-	.word	0x90befffa
-	.word	0xa4506ceb
-	.word	0xbef9a3f7
-	.word	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.text
 	.align	4
 	.globl	Transform_Sha256_Len_crypto
@@ -3745,14 +3628,14 @@ Transform_Sha256_Len_crypto:
 	adr	r3, L_SHA256_trans_crypto_len_k
 	# Load K into vector registers
 	vldm	r3!, {q8-q11}
-	vldm	r3!, {q12, q13}
+	vldm	r3!, {q12-q13}
 	# Load digest into working vars
-	vldm	r0, {q0, q1}
+	vldm	r0, {q0-q1}
 	# Start of loop processing a block
 L_sha256_len_crypto_begin:
 	# Load W
-	vld1.8	{q4, q5}, [r1]!
-	vld1.8	{q6, q7}, [r1]!
+	vld1.8	{q4-q5}, [r1]!
+	vld1.8	{q6-q7}, [r1]!
 	vrev32.8	q4, q4
 	vrev32.8	q5, q5
 	vrev32.8	q6, q6
@@ -3882,7 +3765,7 @@ L_sha256_len_crypto_begin:
 	sub	r3, r3, #0xa0
 	bne	L_sha256_len_crypto_begin
 	# Store digest back
-	vst1.8	{q0, q1}, [r0]
+	vst1.8	{q0-q1}, [r0]
 	vpop	{d8-d15}
 	bx	lr
 	.size	Transform_Sha256_Len_crypto,.-Transform_Sha256_Len_crypto
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
index 2a8c9299e45..e6cdbe2d202 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -53,7 +51,7 @@
 
 #ifndef NO_SHA256
 #ifdef WOLFSSL_ARMASM_NO_NEON
-static const word32 L_SHA256_transform_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -80,13 +78,13 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256_p,
 #else
 WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
     const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("r1") = (const byte*)data_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register word32* L_SHA256_transform_len_k_c asm ("r3") =
+    register wc_Sha256* sha256 __asm__ ("r0") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("r1") = (const byte*)data_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register word32* L_SHA256_transform_len_k_c __asm__ ("r3") =
         (word32*)&L_SHA256_transform_len_k;
 #else
     register word32* L_SHA256_transform_len_k_c =
@@ -146,7 +144,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
 #endif
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load, Reverse and Store W - 64 bytes */
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "ldr	r4, [%[data]]\n\t"
@@ -364,7 +362,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "mov	r3, #3\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_len_start_fast_%=: \n\t"
+    "L_SHA256_transform_len_start_fast_%=:\n\t"
         /* Round 0 */
         "ldr	r5, [%[sha256], #16]\n\t"
         "ldr	r6, [%[sha256], #20]\n\t"
@@ -1636,7 +1634,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "mov	r3, #4\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_len_start_small_%=: \n\t"
+    "L_SHA256_transform_len_start_small_%=:\n\t"
         "sub	r3, r3, #1\n\t"
         /* Round 0 */
         "ldr	r5, [%[sha256], #16]\n\t"
@@ -1688,7 +1686,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_0_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_0_%=:\n\t"
         /* Round 1 */
         "ldr	r5, [%[sha256], #12]\n\t"
         "ldr	r6, [%[sha256], #16]\n\t"
@@ -1739,7 +1737,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #4]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_1_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_1_%=:\n\t"
         /* Round 2 */
         "ldr	r5, [%[sha256], #8]\n\t"
         "ldr	r6, [%[sha256], #12]\n\t"
@@ -1790,7 +1788,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #8]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_2_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_2_%=:\n\t"
         /* Round 3 */
         "ldr	r5, [%[sha256], #4]\n\t"
         "ldr	r6, [%[sha256], #8]\n\t"
@@ -1841,7 +1839,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #12]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_3_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_3_%=:\n\t"
         /* Round 4 */
         "ldr	r5, [%[sha256]]\n\t"
         "ldr	r6, [%[sha256], #4]\n\t"
@@ -1892,7 +1890,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #16]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_4_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_4_%=:\n\t"
         /* Round 5 */
         "ldr	r5, [%[sha256], #28]\n\t"
         "ldr	r6, [%[sha256]]\n\t"
@@ -1943,7 +1941,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #20]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_5_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_5_%=:\n\t"
         /* Round 6 */
         "ldr	r5, [%[sha256], #24]\n\t"
         "ldr	r6, [%[sha256], #28]\n\t"
@@ -1994,7 +1992,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #24]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_6_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_6_%=:\n\t"
         /* Round 7 */
         "ldr	r5, [%[sha256], #20]\n\t"
         "ldr	r6, [%[sha256], #24]\n\t"
@@ -2045,7 +2043,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #28]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_7_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_7_%=:\n\t"
         /* Round 8 */
         "ldr	r5, [%[sha256], #16]\n\t"
         "ldr	r6, [%[sha256], #20]\n\t"
@@ -2096,7 +2094,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #32]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_8_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_8_%=:\n\t"
         /* Round 9 */
         "ldr	r5, [%[sha256], #12]\n\t"
         "ldr	r6, [%[sha256], #16]\n\t"
@@ -2147,7 +2145,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #36]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_9_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_9_%=:\n\t"
         /* Round 10 */
         "ldr	r5, [%[sha256], #8]\n\t"
         "ldr	r6, [%[sha256], #12]\n\t"
@@ -2198,7 +2196,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #40]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_10_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_10_%=:\n\t"
         /* Round 11 */
         "ldr	r5, [%[sha256], #4]\n\t"
         "ldr	r6, [%[sha256], #8]\n\t"
@@ -2249,7 +2247,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #44]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_11_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_11_%=:\n\t"
         /* Round 12 */
         "ldr	r5, [%[sha256]]\n\t"
         "ldr	r6, [%[sha256], #4]\n\t"
@@ -2300,7 +2298,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #48]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_12_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_12_%=:\n\t"
         /* Round 13 */
         "ldr	r5, [%[sha256], #28]\n\t"
         "ldr	r6, [%[sha256]]\n\t"
@@ -2351,7 +2349,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #52]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_13_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_13_%=:\n\t"
         /* Round 14 */
         "ldr	r5, [%[sha256], #24]\n\t"
         "ldr	r6, [%[sha256], #28]\n\t"
@@ -2402,7 +2400,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #56]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_14_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_14_%=:\n\t"
         /* Round 15 */
         "ldr	r5, [%[sha256], #20]\n\t"
         "ldr	r6, [%[sha256], #24]\n\t"
@@ -2453,7 +2451,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "add	r9, r9, r4\n\t"
         "str	r9, [sp, #60]\n\t"
         "\n"
-    "L_SHA256_transform_len_blk_end_15_%=: \n\t"
+    "L_SHA256_transform_len_blk_end_15_%=:\n\t"
         "cmp	r3, #0\n\t"
         "add	r12, r12, #0x40\n\t"
         "bne	L_SHA256_transform_len_start_small_%=\n\t"
@@ -2586,7 +2584,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
 
 #else
 #ifdef WOLFSSL_ARMASM_NO_HW_CRYPTO
-static const word32 L_SHA256_transform_neon_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_neon_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -2613,13 +2611,13 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_neon(wc_Sha256* sha256_p,
 #else
 WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_neon(wc_Sha256* sha256,
     const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("r1") = (const byte*)data_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register word32* L_SHA256_transform_neon_len_k_c asm ("r3") =
+    register wc_Sha256* sha256 __asm__ ("r0") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("r1") = (const byte*)data_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register word32* L_SHA256_transform_neon_len_k_c __asm__ ("r3") =
         (word32*)&L_SHA256_transform_neon_len_k;
 #else
     register word32* L_SHA256_transform_neon_len_k_c =
@@ -2661,7 +2659,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_neon(wc_Sha256* sha256,
 #endif
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_neon_len_begin_%=: \n\t"
+    "L_SHA256_transform_neon_len_begin_%=:\n\t"
         /* Load W */
         "vld1.8	{d0-d3}, [%[data]]!\n\t"
         "vld1.8	{d4-d7}, [%[data]]!\n\t"
@@ -2684,7 +2682,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_neon(wc_Sha256* sha256,
         "mov	lr, #3\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_neon_len_start_%=: \n\t"
+    "L_SHA256_transform_neon_len_start_%=:\n\t"
         /* Round 0 */
         "vmov.32	r10, d0[0]\n\t"
         "ror	%[sha256], r6, #6\n\t"
@@ -3663,7 +3661,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_neon(wc_Sha256* sha256,
 }
 
 #else
-static const word32 L_SHA256_trans_crypto_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_trans_crypto_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -3690,13 +3688,13 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_crypto(wc_Sha256* sha256_p,
 #else
 WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_crypto(wc_Sha256* sha256,
     const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("r0") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("r1") = (const byte*)data_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register word32* L_SHA256_trans_crypto_len_k_c asm ("r3") =
+    register wc_Sha256* sha256 __asm__ ("r0") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("r1") = (const byte*)data_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register word32* L_SHA256_trans_crypto_len_k_c __asm__ ("r3") =
         (word32*)&L_SHA256_trans_crypto_len_k;
 #else
     register word32* L_SHA256_trans_crypto_len_k_c =
@@ -3712,7 +3710,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_crypto(wc_Sha256* sha256,
         "vldm	%[sha256], {q0-q1}\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_sha256_len_crypto_begin_%=: \n\t"
+    "L_sha256_len_crypto_begin_%=:\n\t"
         /* Load W */
         "vld1.8	{q4-q5}, [%[data]]!\n\t"
         "vld1.8	{q6-q7}, [%[data]]!\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
index 701adb24136..6d2f0172994 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
@@ -32,59 +32,32 @@
 #ifndef WOLFSSL_ARMASM_INLINE
 #ifdef WOLFSSL_SHA3
 #ifndef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_sha3_arm32_neon_rt, %object
 	.size	L_sha3_arm32_neon_rt, 192
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
+#ifndef __APPLE__
 	.align	4
+#else
+	.p2align	4
+#endif /* __APPLE__ */
 L_sha3_arm32_neon_rt:
-	.word	0x1
-	.word	0x0
-	.word	0x8082
-	.word	0x0
-	.word	0x808a
-	.word	0x80000000
-	.word	0x80008000
-	.word	0x80000000
-	.word	0x808b
-	.word	0x0
-	.word	0x80000001
-	.word	0x0
-	.word	0x80008081
-	.word	0x80000000
-	.word	0x8009
-	.word	0x80000000
-	.word	0x8a
-	.word	0x0
-	.word	0x88
-	.word	0x0
-	.word	0x80008009
-	.word	0x0
-	.word	0x8000000a
-	.word	0x0
-	.word	0x8000808b
-	.word	0x0
-	.word	0x8b
-	.word	0x80000000
-	.word	0x8089
-	.word	0x80000000
-	.word	0x8003
-	.word	0x80000000
-	.word	0x8002
-	.word	0x80000000
-	.word	0x80
-	.word	0x80000000
-	.word	0x800a
-	.word	0x0
-	.word	0x8000000a
-	.word	0x80000000
-	.word	0x80008081
-	.word	0x80000000
-	.word	0x8080
-	.word	0x80000000
-	.word	0x80000001
-	.word	0x0
-	.word	0x80008008
-	.word	0x80000000
+	.quad	0x0000000000000001,0x0000000000008082
+	.quad	0x800000000000808a,0x8000000080008000
+	.quad	0x000000000000808b,0x0000000080000001
+	.quad	0x8000000080008081,0x8000000000008009
+	.quad	0x000000000000008a,0x0000000000000088
+	.quad	0x0000000080008009,0x000000008000000a
+	.quad	0x000000008000808b,0x800000000000008b
+	.quad	0x8000000000008089,0x8000000000008003
+	.quad	0x8000000000008002,0x8000000000000080
+	.quad	0x000000000000800a,0x800000008000000a
+	.quad	0x8000000080008081,0x8000000000008080
+	.quad	0x0000000080000001,0x8000000080008008
 	.text
 	.align	4
 	.globl	BlockSha3
@@ -125,7 +98,7 @@ L_sha3_arm32_neon_begin:
 	veor	d28, d28, d22
 	veor	d29, d29, d23
 	veor	d25, d25, d24
-	vst1.8	{d25, d26}, [r3]
+	vst1.8	{d25-d26}, [r3]
 	# Calc t[0..4] and XOR into s[i*5..i*5+4]
 	# t[0]
 	vshr.u64	d30, d27, #63
@@ -353,59 +326,32 @@ L_sha3_arm32_neon_begin:
 	.size	BlockSha3,.-BlockSha3
 #endif /* WOLFSSL_ARMASM_NO_NEON */
 #ifdef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_sha3_arm32_rt, %object
 	.size	L_sha3_arm32_rt, 192
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
+#ifndef __APPLE__
 	.align	4
+#else
+	.p2align	4
+#endif /* __APPLE__ */
 L_sha3_arm32_rt:
-	.word	0x1
-	.word	0x0
-	.word	0x8082
-	.word	0x0
-	.word	0x808a
-	.word	0x80000000
-	.word	0x80008000
-	.word	0x80000000
-	.word	0x808b
-	.word	0x0
-	.word	0x80000001
-	.word	0x0
-	.word	0x80008081
-	.word	0x80000000
-	.word	0x8009
-	.word	0x80000000
-	.word	0x8a
-	.word	0x0
-	.word	0x88
-	.word	0x0
-	.word	0x80008009
-	.word	0x0
-	.word	0x8000000a
-	.word	0x0
-	.word	0x8000808b
-	.word	0x0
-	.word	0x8b
-	.word	0x80000000
-	.word	0x8089
-	.word	0x80000000
-	.word	0x8003
-	.word	0x80000000
-	.word	0x8002
-	.word	0x80000000
-	.word	0x80
-	.word	0x80000000
-	.word	0x800a
-	.word	0x0
-	.word	0x8000000a
-	.word	0x80000000
-	.word	0x80008081
-	.word	0x80000000
-	.word	0x8080
-	.word	0x80000000
-	.word	0x80000001
-	.word	0x0
-	.word	0x80008008
-	.word	0x80000000
+	.quad	0x0000000000000001,0x0000000000008082
+	.quad	0x800000000000808a,0x8000000080008000
+	.quad	0x000000000000808b,0x0000000080000001
+	.quad	0x8000000080008081,0x8000000000008009
+	.quad	0x000000000000008a,0x0000000000000088
+	.quad	0x0000000080008009,0x000000008000000a
+	.quad	0x000000008000808b,0x800000000000008b
+	.quad	0x8000000000008089,0x8000000000008003
+	.quad	0x8000000000008002,0x8000000000000080
+	.quad	0x000000000000800a,0x800000008000000a
+	.quad	0x8000000080008081,0x8000000000008080
+	.quad	0x0000000080000001,0x8000000080008008
 	.text
 	.align	4
 	.globl	BlockSha3
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
index dd191f26e1f..4889b239f0b 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -51,7 +49,7 @@
 
 #ifdef WOLFSSL_SHA3
 #ifndef WOLFSSL_ARMASM_NO_NEON
-static const word64 L_sha3_arm32_neon_rt[] = {
+XALIGNED(16) static const word64 L_sha3_arm32_neon_rt[] = {
     0x0000000000000001UL, 0x0000000000008082UL,
     0x800000000000808aUL, 0x8000000080008000UL,
     0x000000000000808bUL, 0x0000000080000001UL,
@@ -72,11 +70,11 @@ static const word64 L_sha3_arm32_neon_rt[] = {
 WC_OMIT_FRAME_POINTER void BlockSha3(word64* state_p)
 #else
 WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register word64* state asm ("r0") = (word64*)state_p;
-    register word64* L_sha3_arm32_neon_rt_c asm ("r1") =
+    register word64* state __asm__ ("r0") = (word64*)state_p;
+    register word64* L_sha3_arm32_neon_rt_c __asm__ ("r1") =
         (word64*)&L_sha3_arm32_neon_rt;
 #else
     register word64* L_sha3_arm32_neon_rt_c = (word64*)&L_sha3_arm32_neon_rt;
@@ -96,7 +94,7 @@ WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
         "vld1.8	{d24}, [%[state]]\n\t"
         "sub	%[state], %[state], #0xc0\n\t"
         "\n"
-    "L_sha3_arm32_neon_begin_%=: \n\t"
+    "L_sha3_arm32_neon_begin_%=:\n\t"
         /* Calc b[0..4] */
         "veor	d26, d0, d5\n\t"
         "veor	d27, d1, d6\n\t"
@@ -359,7 +357,7 @@ WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
 
 #endif /* WOLFSSL_ARMASM_NO_NEON */
 #ifdef WOLFSSL_ARMASM_NO_NEON
-static const word64 L_sha3_arm32_rt[] = {
+XALIGNED(16) static const word64 L_sha3_arm32_rt[] = {
     0x0000000000000001UL, 0x0000000000008082UL,
     0x800000000000808aUL, 0x8000000080008000UL,
     0x000000000000808bUL, 0x0000000080000001UL,
@@ -380,11 +378,12 @@ static const word64 L_sha3_arm32_rt[] = {
 WC_OMIT_FRAME_POINTER void BlockSha3(word64* state_p)
 #else
 WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register word64* state asm ("r0") = (word64*)state_p;
-    register word64* L_sha3_arm32_rt_c asm ("r1") = (word64*)&L_sha3_arm32_rt;
+    register word64* state __asm__ ("r0") = (word64*)state_p;
+    register word64* L_sha3_arm32_rt_c __asm__ ("r1") =
+        (word64*)&L_sha3_arm32_rt;
 #else
     register word64* L_sha3_arm32_rt_c = (word64*)&L_sha3_arm32_rt;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
@@ -394,7 +393,7 @@ WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
         "mov	r1, %[L_sha3_arm32_rt]\n\t"
         "mov	r2, #12\n\t"
         "\n"
-    "L_sha3_arm32_begin_%=: \n\t"
+    "L_sha3_arm32_begin_%=:\n\t"
         "str	r2, [sp, #200]\n\t"
         /* Round even */
         /* Calc b[4] */
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
index 9624aa8b49d..60a7d731362 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
@@ -32,171 +32,60 @@
 #ifndef WOLFSSL_ARMASM_INLINE
 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
 #ifdef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_SHA512_transform_len_k, %object
 	.size	L_SHA512_transform_len_k, 640
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
+#ifndef __APPLE__
 	.align	4
+#else
+	.p2align	4
+#endif /* __APPLE__ */
 L_SHA512_transform_len_k:
-	.word	0xd728ae22
-	.word	0x428a2f98
-	.word	0x23ef65cd
-	.word	0x71374491
-	.word	0xec4d3b2f
-	.word	0xb5c0fbcf
-	.word	0x8189dbbc
-	.word	0xe9b5dba5
-	.word	0xf348b538
-	.word	0x3956c25b
-	.word	0xb605d019
-	.word	0x59f111f1
-	.word	0xaf194f9b
-	.word	0x923f82a4
-	.word	0xda6d8118
-	.word	0xab1c5ed5
-	.word	0xa3030242
-	.word	0xd807aa98
-	.word	0x45706fbe
-	.word	0x12835b01
-	.word	0x4ee4b28c
-	.word	0x243185be
-	.word	0xd5ffb4e2
-	.word	0x550c7dc3
-	.word	0xf27b896f
-	.word	0x72be5d74
-	.word	0x3b1696b1
-	.word	0x80deb1fe
-	.word	0x25c71235
-	.word	0x9bdc06a7
-	.word	0xcf692694
-	.word	0xc19bf174
-	.word	0x9ef14ad2
-	.word	0xe49b69c1
-	.word	0x384f25e3
-	.word	0xefbe4786
-	.word	0x8b8cd5b5
-	.word	0xfc19dc6
-	.word	0x77ac9c65
-	.word	0x240ca1cc
-	.word	0x592b0275
-	.word	0x2de92c6f
-	.word	0x6ea6e483
-	.word	0x4a7484aa
-	.word	0xbd41fbd4
-	.word	0x5cb0a9dc
-	.word	0x831153b5
-	.word	0x76f988da
-	.word	0xee66dfab
-	.word	0x983e5152
-	.word	0x2db43210
-	.word	0xa831c66d
-	.word	0x98fb213f
-	.word	0xb00327c8
-	.word	0xbeef0ee4
-	.word	0xbf597fc7
-	.word	0x3da88fc2
-	.word	0xc6e00bf3
-	.word	0x930aa725
-	.word	0xd5a79147
-	.word	0xe003826f
-	.word	0x6ca6351
-	.word	0xa0e6e70
-	.word	0x14292967
-	.word	0x46d22ffc
-	.word	0x27b70a85
-	.word	0x5c26c926
-	.word	0x2e1b2138
-	.word	0x5ac42aed
-	.word	0x4d2c6dfc
-	.word	0x9d95b3df
-	.word	0x53380d13
-	.word	0x8baf63de
-	.word	0x650a7354
-	.word	0x3c77b2a8
-	.word	0x766a0abb
-	.word	0x47edaee6
-	.word	0x81c2c92e
-	.word	0x1482353b
-	.word	0x92722c85
-	.word	0x4cf10364
-	.word	0xa2bfe8a1
-	.word	0xbc423001
-	.word	0xa81a664b
-	.word	0xd0f89791
-	.word	0xc24b8b70
-	.word	0x654be30
-	.word	0xc76c51a3
-	.word	0xd6ef5218
-	.word	0xd192e819
-	.word	0x5565a910
-	.word	0xd6990624
-	.word	0x5771202a
-	.word	0xf40e3585
-	.word	0x32bbd1b8
-	.word	0x106aa070
-	.word	0xb8d2d0c8
-	.word	0x19a4c116
-	.word	0x5141ab53
-	.word	0x1e376c08
-	.word	0xdf8eeb99
-	.word	0x2748774c
-	.word	0xe19b48a8
-	.word	0x34b0bcb5
-	.word	0xc5c95a63
-	.word	0x391c0cb3
-	.word	0xe3418acb
-	.word	0x4ed8aa4a
-	.word	0x7763e373
-	.word	0x5b9cca4f
-	.word	0xd6b2b8a3
-	.word	0x682e6ff3
-	.word	0x5defb2fc
-	.word	0x748f82ee
-	.word	0x43172f60
-	.word	0x78a5636f
-	.word	0xa1f0ab72
-	.word	0x84c87814
-	.word	0x1a6439ec
-	.word	0x8cc70208
-	.word	0x23631e28
-	.word	0x90befffa
-	.word	0xde82bde9
-	.word	0xa4506ceb
-	.word	0xb2c67915
-	.word	0xbef9a3f7
-	.word	0xe372532b
-	.word	0xc67178f2
-	.word	0xea26619c
-	.word	0xca273ece
-	.word	0x21c0c207
-	.word	0xd186b8c7
-	.word	0xcde0eb1e
-	.word	0xeada7dd6
-	.word	0xee6ed178
-	.word	0xf57d4f7f
-	.word	0x72176fba
-	.word	0x6f067aa
-	.word	0xa2c898a6
-	.word	0xa637dc5
-	.word	0xbef90dae
-	.word	0x113f9804
-	.word	0x131c471b
-	.word	0x1b710b35
-	.word	0x23047d84
-	.word	0x28db77f5
-	.word	0x40c72493
-	.word	0x32caab7b
-	.word	0x15c9bebc
-	.word	0x3c9ebe0a
-	.word	0x9c100d4c
-	.word	0x431d67c4
-	.word	0xcb3e42b6
-	.word	0x4cc5d4be
-	.word	0xfc657e2a
-	.word	0x597f299c
-	.word	0x3ad6faec
-	.word	0x5fcb6fab
-	.word	0x4a475817
-	.word	0x6c44198c
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 	.text
 	.align	4
 	.globl	Transform_Sha512_Len_base
@@ -7618,171 +7507,60 @@ L_SHA512_transform_len_start:
 	.size	Transform_Sha512_Len_base,.-Transform_Sha512_Len_base
 #endif /* WOLFSSL_ARMASM_NO_NEON */
 #ifndef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_SHA512_transform_neon_len_k, %object
 	.size	L_SHA512_transform_neon_len_k, 640
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
+#ifndef __APPLE__
 	.align	4
+#else
+	.p2align	4
+#endif /* __APPLE__ */
 L_SHA512_transform_neon_len_k:
-	.word	0xd728ae22
-	.word	0x428a2f98
-	.word	0x23ef65cd
-	.word	0x71374491
-	.word	0xec4d3b2f
-	.word	0xb5c0fbcf
-	.word	0x8189dbbc
-	.word	0xe9b5dba5
-	.word	0xf348b538
-	.word	0x3956c25b
-	.word	0xb605d019
-	.word	0x59f111f1
-	.word	0xaf194f9b
-	.word	0x923f82a4
-	.word	0xda6d8118
-	.word	0xab1c5ed5
-	.word	0xa3030242
-	.word	0xd807aa98
-	.word	0x45706fbe
-	.word	0x12835b01
-	.word	0x4ee4b28c
-	.word	0x243185be
-	.word	0xd5ffb4e2
-	.word	0x550c7dc3
-	.word	0xf27b896f
-	.word	0x72be5d74
-	.word	0x3b1696b1
-	.word	0x80deb1fe
-	.word	0x25c71235
-	.word	0x9bdc06a7
-	.word	0xcf692694
-	.word	0xc19bf174
-	.word	0x9ef14ad2
-	.word	0xe49b69c1
-	.word	0x384f25e3
-	.word	0xefbe4786
-	.word	0x8b8cd5b5
-	.word	0xfc19dc6
-	.word	0x77ac9c65
-	.word	0x240ca1cc
-	.word	0x592b0275
-	.word	0x2de92c6f
-	.word	0x6ea6e483
-	.word	0x4a7484aa
-	.word	0xbd41fbd4
-	.word	0x5cb0a9dc
-	.word	0x831153b5
-	.word	0x76f988da
-	.word	0xee66dfab
-	.word	0x983e5152
-	.word	0x2db43210
-	.word	0xa831c66d
-	.word	0x98fb213f
-	.word	0xb00327c8
-	.word	0xbeef0ee4
-	.word	0xbf597fc7
-	.word	0x3da88fc2
-	.word	0xc6e00bf3
-	.word	0x930aa725
-	.word	0xd5a79147
-	.word	0xe003826f
-	.word	0x6ca6351
-	.word	0xa0e6e70
-	.word	0x14292967
-	.word	0x46d22ffc
-	.word	0x27b70a85
-	.word	0x5c26c926
-	.word	0x2e1b2138
-	.word	0x5ac42aed
-	.word	0x4d2c6dfc
-	.word	0x9d95b3df
-	.word	0x53380d13
-	.word	0x8baf63de
-	.word	0x650a7354
-	.word	0x3c77b2a8
-	.word	0x766a0abb
-	.word	0x47edaee6
-	.word	0x81c2c92e
-	.word	0x1482353b
-	.word	0x92722c85
-	.word	0x4cf10364
-	.word	0xa2bfe8a1
-	.word	0xbc423001
-	.word	0xa81a664b
-	.word	0xd0f89791
-	.word	0xc24b8b70
-	.word	0x654be30
-	.word	0xc76c51a3
-	.word	0xd6ef5218
-	.word	0xd192e819
-	.word	0x5565a910
-	.word	0xd6990624
-	.word	0x5771202a
-	.word	0xf40e3585
-	.word	0x32bbd1b8
-	.word	0x106aa070
-	.word	0xb8d2d0c8
-	.word	0x19a4c116
-	.word	0x5141ab53
-	.word	0x1e376c08
-	.word	0xdf8eeb99
-	.word	0x2748774c
-	.word	0xe19b48a8
-	.word	0x34b0bcb5
-	.word	0xc5c95a63
-	.word	0x391c0cb3
-	.word	0xe3418acb
-	.word	0x4ed8aa4a
-	.word	0x7763e373
-	.word	0x5b9cca4f
-	.word	0xd6b2b8a3
-	.word	0x682e6ff3
-	.word	0x5defb2fc
-	.word	0x748f82ee
-	.word	0x43172f60
-	.word	0x78a5636f
-	.word	0xa1f0ab72
-	.word	0x84c87814
-	.word	0x1a6439ec
-	.word	0x8cc70208
-	.word	0x23631e28
-	.word	0x90befffa
-	.word	0xde82bde9
-	.word	0xa4506ceb
-	.word	0xb2c67915
-	.word	0xbef9a3f7
-	.word	0xe372532b
-	.word	0xc67178f2
-	.word	0xea26619c
-	.word	0xca273ece
-	.word	0x21c0c207
-	.word	0xd186b8c7
-	.word	0xcde0eb1e
-	.word	0xeada7dd6
-	.word	0xee6ed178
-	.word	0xf57d4f7f
-	.word	0x72176fba
-	.word	0x6f067aa
-	.word	0xa2c898a6
-	.word	0xa637dc5
-	.word	0xbef90dae
-	.word	0x113f9804
-	.word	0x131c471b
-	.word	0x1b710b35
-	.word	0x23047d84
-	.word	0x28db77f5
-	.word	0x40c72493
-	.word	0x32caab7b
-	.word	0x15c9bebc
-	.word	0x3c9ebe0a
-	.word	0x9c100d4c
-	.word	0x431d67c4
-	.word	0xcb3e42b6
-	.word	0x4cc5d4be
-	.word	0xfc657e2a
-	.word	0x597f299c
-	.word	0x3ad6faec
-	.word	0x5fcb6fab
-	.word	0x4a475817
-	.word	0x6c44198c
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 	.text
 	.align	4
 	.fpu	neon
@@ -7796,10 +7574,10 @@ Transform_Sha512_Len_neon:
 	# Start of loop processing a block
 L_SHA512_transform_neon_len_begin:
 	# Load W
-	vld1.8	{q8, q9}, [r1]!
-	vld1.8	{q10, q11}, [r1]!
-	vld1.8	{q12, q13}, [r1]!
-	vld1.8	{q14, q15}, [r1]!
+	vld1.8	{q8-q9}, [r1]!
+	vld1.8	{q10-q11}, [r1]!
+	vld1.8	{q12-q13}, [r1]!
+	vld1.8	{q14-q15}, [r1]!
 #ifndef WOLFSSL_ARM_ARCH_NEON_64BIT
 	vrev64.8	q8, q8
 	vrev64.8	q9, q9
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
index 6be8ccb8cd1..48710dc3cf6 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
@@ -30,8 +30,6 @@
 
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_ARMASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -53,7 +51,7 @@
 #include <wolfssl/wolfcrypt/sha512.h>
 
 #ifdef WOLFSSL_ARMASM_NO_NEON
-static const word64 L_SHA512_transform_len_k[] = {
+XALIGNED(16) static const word64 L_SHA512_transform_len_k[] = {
     0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
     0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
     0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
@@ -104,13 +102,13 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512_p,
 #else
 WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512,
     const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha512* sha512 asm ("r0") = (wc_Sha512*)sha512_p;
-    register const byte* data asm ("r1") = (const byte*)data_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register word64* L_SHA512_transform_len_k_c asm ("r3") =
+    register wc_Sha512* sha512 __asm__ ("r0") = (wc_Sha512*)sha512_p;
+    register const byte* data __asm__ ("r1") = (const byte*)data_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register word64* L_SHA512_transform_len_k_c __asm__ ("r3") =
         (word64*)&L_SHA512_transform_len_k;
 #else
     register word64* L_SHA512_transform_len_k_c =
@@ -218,7 +216,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512,
 #endif
         /* Start of loop processing a block */
         "\n"
-    "L_SHA512_transform_len_begin_%=: \n\t"
+    "L_SHA512_transform_len_begin_%=:\n\t"
         /* Load, Reverse and Store W - 64 bytes */
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
         "ldr	r4, [%[data]]\n\t"
@@ -529,7 +527,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512,
         "mov	r12, #4\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA512_transform_len_start_%=: \n\t"
+    "L_SHA512_transform_len_start_%=:\n\t"
         /* Round 0 */
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "ldr	r4, [%[sha512], #32]\n\t"
@@ -7549,7 +7547,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512,
 #include <wolfssl/wolfcrypt/sha512.h>
 
 #ifndef WOLFSSL_ARMASM_NO_NEON
-static const word64 L_SHA512_transform_neon_len_k[] = {
+XALIGNED(16) static const word64 L_SHA512_transform_neon_len_k[] = {
     0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
     0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
     0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
@@ -7600,13 +7598,13 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_neon(wc_Sha512* sha512_p,
 #else
 WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_neon(wc_Sha512* sha512,
     const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha512* sha512 asm ("r0") = (wc_Sha512*)sha512_p;
-    register const byte* data asm ("r1") = (const byte*)data_p;
-    register word32 len asm ("r2") = (word32)len_p;
-    register word64* L_SHA512_transform_neon_len_k_c asm ("r3") =
+    register wc_Sha512* sha512 __asm__ ("r0") = (wc_Sha512*)sha512_p;
+    register const byte* data __asm__ ("r1") = (const byte*)data_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register word64* L_SHA512_transform_neon_len_k_c __asm__ ("r3") =
         (word64*)&L_SHA512_transform_neon_len_k;
 #else
     register word64* L_SHA512_transform_neon_len_k_c =
@@ -7619,7 +7617,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_neon(wc_Sha512* sha512,
         "vldm.64	%[sha512], {d0-d7}\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA512_transform_neon_len_begin_%=: \n\t"
+    "L_SHA512_transform_neon_len_begin_%=:\n\t"
         /* Load W */
         "vld1.8	{q8-q9}, [%[data]]!\n\t"
         "vld1.8	{q10-q11}, [%[data]]!\n\t"
@@ -7655,7 +7653,7 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_neon(wc_Sha512* sha512,
         "mov	r12, #4\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA512_transform_neon_len_start_%=: \n\t"
+    "L_SHA512_transform_neon_len_start_%=:\n\t"
         /* Round 0 */
         "vld1.64	{d12}, [r3:64]!\n\t"
         "vshl.u64	d8, d4, #50\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm.S b/wolfcrypt/src/port/arm/armv8-aes-asm.S
index cf46a0790ba..fa48e67b178 100644
--- a/wolfcrypt/src/port/arm/armv8-aes-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-aes-asm.S
@@ -43123,16 +43123,17 @@ L_aes_xts_decrypt_arm64_crypto_done:
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_NEON_te, %object
 	.section	.rodata
+	.type	L_AES_ARM64_NEON_te, %object
 	.size	L_AES_ARM64_NEON_te, 256
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_NEON_te:
 	.byte	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
@@ -43169,16 +43170,17 @@ L_AES_ARM64_NEON_te:
 	.byte	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_NEON_shift_rows_shuffle, %object
 	.section	.rodata
+	.type	L_AES_ARM64_NEON_shift_rows_shuffle, %object
 	.size	L_AES_ARM64_NEON_shift_rows_shuffle, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_NEON_shift_rows_shuffle:
 	.byte	0x0c,0x09,0x06,0x03,0x00,0x0d,0x0a,0x07
@@ -43249,28 +43251,22 @@ L_AES_invert_key_NEON_mix_loop:
 #endif /* HAVE_AES_DECRYPT */
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_NEON_rcon, %object
 	.section	.rodata
+	.type	L_AES_ARM64_NEON_rcon, %object
 	.size	L_AES_ARM64_NEON_rcon, 40
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_NEON_rcon:
-	.word	0x01000000
-	.word	0x02000000
-	.word	0x04000000
-	.word	0x08000000
-	.word	0x10000000
-	.word	0x20000000
-	.word	0x40000000
-	.word	0x80000000
-	.word	0x1b000000
-	.word	0x36000000
+	.long	0x01000000,0x02000000,0x04000000,0x08000000
+	.long	0x10000000,0x20000000,0x40000000,0x80000000
+	.long	0x1b000000,0x36000000
 #ifndef __APPLE__
 .text
 .globl	AES_set_encrypt_key_NEON
@@ -45307,16 +45303,17 @@ L_AES_CTR_encrypt_NEON_data_done:
     defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB)
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_NEON_td, %object
 	.section	.rodata
+	.type	L_AES_ARM64_NEON_td, %object
 	.size	L_AES_ARM64_NEON_td, 256
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_NEON_td:
 	.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
@@ -45353,16 +45350,17 @@ L_AES_ARM64_NEON_td:
 	.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_NEON_shift_rows_invshuffle, %object
 	.section	.rodata
+	.type	L_AES_ARM64_NEON_shift_rows_invshuffle, %object
 	.size	L_AES_ARM64_NEON_shift_rows_invshuffle, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_NEON_shift_rows_invshuffle:
 	.byte	0x04,0x09,0x0e,0x03,0x08,0x0d,0x02,0x07
@@ -51289,548 +51287,166 @@ L_AES_XTS_decrypt_NEON_data_done:
 #ifdef HAVE_AES_DECRYPT
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_td, %object
 	.section	.rodata
+	.type	L_AES_ARM64_td, %object
 	.size	L_AES_ARM64_td, 1024
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_td:
-	.word	0x5051f4a7
-	.word	0x537e4165
-	.word	0xc31a17a4
-	.word	0x963a275e
-	.word	0xcb3bab6b
-	.word	0xf11f9d45
-	.word	0xabacfa58
-	.word	0x934be303
-	.word	0x552030fa
-	.word	0xf6ad766d
-	.word	0x9188cc76
-	.word	0x25f5024c
-	.word	0xfc4fe5d7
-	.word	0xd7c52acb
-	.word	0x80263544
-	.word	0x8fb562a3
-	.word	0x49deb15a
-	.word	0x6725ba1b
-	.word	0x9845ea0e
-	.word	0xe15dfec0
-	.word	0x02c32f75
-	.word	0x12814cf0
-	.word	0xa38d4697
-	.word	0xc66bd3f9
-	.word	0xe7038f5f
-	.word	0x9515929c
-	.word	0xebbf6d7a
-	.word	0xda955259
-	.word	0x2dd4be83
-	.word	0xd3587421
-	.word	0x2949e069
-	.word	0x448ec9c8
-	.word	0x6a75c289
-	.word	0x78f48e79
-	.word	0x6b99583e
-	.word	0xdd27b971
-	.word	0xb6bee14f
-	.word	0x17f088ad
-	.word	0x66c920ac
-	.word	0xb47dce3a
-	.word	0x1863df4a
-	.word	0x82e51a31
-	.word	0x60975133
-	.word	0x4562537f
-	.word	0xe0b16477
-	.word	0x84bb6bae
-	.word	0x1cfe81a0
-	.word	0x94f9082b
-	.word	0x58704868
-	.word	0x198f45fd
-	.word	0x8794de6c
-	.word	0xb7527bf8
-	.word	0x23ab73d3
-	.word	0xe2724b02
-	.word	0x57e31f8f
-	.word	0x2a6655ab
-	.word	0x07b2eb28
-	.word	0x032fb5c2
-	.word	0x9a86c57b
-	.word	0xa5d33708
-	.word	0xf2302887
-	.word	0xb223bfa5
-	.word	0xba02036a
-	.word	0x5ced1682
-	.word	0x2b8acf1c
-	.word	0x92a779b4
-	.word	0xf0f307f2
-	.word	0xa14e69e2
-	.word	0xcd65daf4
-	.word	0xd50605be
-	.word	0x1fd13462
-	.word	0x8ac4a6fe
-	.word	0x9d342e53
-	.word	0xa0a2f355
-	.word	0x32058ae1
-	.word	0x75a4f6eb
-	.word	0x390b83ec
-	.word	0xaa4060ef
-	.word	0x065e719f
-	.word	0x51bd6e10
-	.word	0xf93e218a
-	.word	0x3d96dd06
-	.word	0xaedd3e05
-	.word	0x464de6bd
-	.word	0xb591548d
-	.word	0x0571c45d
-	.word	0x6f0406d4
-	.word	0xff605015
-	.word	0x241998fb
-	.word	0x97d6bde9
-	.word	0xcc894043
-	.word	0x7767d99e
-	.word	0xbdb0e842
-	.word	0x8807898b
-	.word	0x38e7195b
-	.word	0xdb79c8ee
-	.word	0x47a17c0a
-	.word	0xe97c420f
-	.word	0xc9f8841e
-	.word	0x00000000
-	.word	0x83098086
-	.word	0x48322bed
-	.word	0xac1e1170
-	.word	0x4e6c5a72
-	.word	0xfbfd0eff
-	.word	0x560f8538
-	.word	0x1e3daed5
-	.word	0x27362d39
-	.word	0x640a0fd9
-	.word	0x21685ca6
-	.word	0xd19b5b54
-	.word	0x3a24362e
-	.word	0xb10c0a67
-	.word	0x0f9357e7
-	.word	0xd2b4ee96
-	.word	0x9e1b9b91
-	.word	0x4f80c0c5
-	.word	0xa261dc20
-	.word	0x695a774b
-	.word	0x161c121a
-	.word	0x0ae293ba
-	.word	0xe5c0a02a
-	.word	0x433c22e0
-	.word	0x1d121b17
-	.word	0x0b0e090d
-	.word	0xadf28bc7
-	.word	0xb92db6a8
-	.word	0xc8141ea9
-	.word	0x8557f119
-	.word	0x4caf7507
-	.word	0xbbee99dd
-	.word	0xfda37f60
-	.word	0x9ff70126
-	.word	0xbc5c72f5
-	.word	0xc544663b
-	.word	0x345bfb7e
-	.word	0x768b4329
-	.word	0xdccb23c6
-	.word	0x68b6edfc
-	.word	0x63b8e4f1
-	.word	0xcad731dc
-	.word	0x10426385
-	.word	0x40139722
-	.word	0x2084c611
-	.word	0x7d854a24
-	.word	0xf8d2bb3d
-	.word	0x11aef932
-	.word	0x6dc729a1
-	.word	0x4b1d9e2f
-	.word	0xf3dcb230
-	.word	0xec0d8652
-	.word	0xd077c1e3
-	.word	0x6c2bb316
-	.word	0x99a970b9
-	.word	0xfa119448
-	.word	0x2247e964
-	.word	0xc4a8fc8c
-	.word	0x1aa0f03f
-	.word	0xd8567d2c
-	.word	0xef223390
-	.word	0xc787494e
-	.word	0xc1d938d1
-	.word	0xfe8ccaa2
-	.word	0x3698d40b
-	.word	0xcfa6f581
-	.word	0x28a57ade
-	.word	0x26dab78e
-	.word	0xa43fadbf
-	.word	0xe42c3a9d
-	.word	0x0d507892
-	.word	0x9b6a5fcc
-	.word	0x62547e46
-	.word	0xc2f68d13
-	.word	0xe890d8b8
-	.word	0x5e2e39f7
-	.word	0xf582c3af
-	.word	0xbe9f5d80
-	.word	0x7c69d093
-	.word	0xa96fd52d
-	.word	0xb3cf2512
-	.word	0x3bc8ac99
-	.word	0xa710187d
-	.word	0x6ee89c63
-	.word	0x7bdb3bbb
-	.word	0x09cd2678
-	.word	0xf46e5918
-	.word	0x01ec9ab7
-	.word	0xa8834f9a
-	.word	0x65e6956e
-	.word	0x7eaaffe6
-	.word	0x0821bccf
-	.word	0xe6ef15e8
-	.word	0xd9bae79b
-	.word	0xce4a6f36
-	.word	0xd4ea9f09
-	.word	0xd629b07c
-	.word	0xaf31a4b2
-	.word	0x312a3f23
-	.word	0x30c6a594
-	.word	0xc035a266
-	.word	0x37744ebc
-	.word	0xa6fc82ca
-	.word	0xb0e090d0
-	.word	0x1533a7d8
-	.word	0x4af10498
-	.word	0xf741ecda
-	.word	0x0e7fcd50
-	.word	0x2f1791f6
-	.word	0x8d764dd6
-	.word	0x4d43efb0
-	.word	0x54ccaa4d
-	.word	0xdfe49604
-	.word	0xe39ed1b5
-	.word	0x1b4c6a88
-	.word	0xb8c12c1f
-	.word	0x7f466551
-	.word	0x049d5eea
-	.word	0x5d018c35
-	.word	0x73fa8774
-	.word	0x2efb0b41
-	.word	0x5ab3671d
-	.word	0x5292dbd2
-	.word	0x33e91056
-	.word	0x136dd647
-	.word	0x8c9ad761
-	.word	0x7a37a10c
-	.word	0x8e59f814
-	.word	0x89eb133c
-	.word	0xeecea927
-	.word	0x35b761c9
-	.word	0xede11ce5
-	.word	0x3c7a47b1
-	.word	0x599cd2df
-	.word	0x3f55f273
-	.word	0x791814ce
-	.word	0xbf73c737
-	.word	0xea53f7cd
-	.word	0x5b5ffdaa
-	.word	0x14df3d6f
-	.word	0x867844db
-	.word	0x81caaff3
-	.word	0x3eb968c4
-	.word	0x2c382434
-	.word	0x5fc2a340
-	.word	0x72161dc3
-	.word	0x0cbce225
-	.word	0x8b283c49
-	.word	0x41ff0d95
-	.word	0x7139a801
-	.word	0xde080cb3
-	.word	0x9cd8b4e4
-	.word	0x906456c1
-	.word	0x617bcb84
-	.word	0x70d532b6
-	.word	0x74486c5c
-	.word	0x42d0b857
+	.long	0x5051f4a7,0x537e4165,0xc31a17a4,0x963a275e
+	.long	0xcb3bab6b,0xf11f9d45,0xabacfa58,0x934be303
+	.long	0x552030fa,0xf6ad766d,0x9188cc76,0x25f5024c
+	.long	0xfc4fe5d7,0xd7c52acb,0x80263544,0x8fb562a3
+	.long	0x49deb15a,0x6725ba1b,0x9845ea0e,0xe15dfec0
+	.long	0x02c32f75,0x12814cf0,0xa38d4697,0xc66bd3f9
+	.long	0xe7038f5f,0x9515929c,0xebbf6d7a,0xda955259
+	.long	0x2dd4be83,0xd3587421,0x2949e069,0x448ec9c8
+	.long	0x6a75c289,0x78f48e79,0x6b99583e,0xdd27b971
+	.long	0xb6bee14f,0x17f088ad,0x66c920ac,0xb47dce3a
+	.long	0x1863df4a,0x82e51a31,0x60975133,0x4562537f
+	.long	0xe0b16477,0x84bb6bae,0x1cfe81a0,0x94f9082b
+	.long	0x58704868,0x198f45fd,0x8794de6c,0xb7527bf8
+	.long	0x23ab73d3,0xe2724b02,0x57e31f8f,0x2a6655ab
+	.long	0x07b2eb28,0x032fb5c2,0x9a86c57b,0xa5d33708
+	.long	0xf2302887,0xb223bfa5,0xba02036a,0x5ced1682
+	.long	0x2b8acf1c,0x92a779b4,0xf0f307f2,0xa14e69e2
+	.long	0xcd65daf4,0xd50605be,0x1fd13462,0x8ac4a6fe
+	.long	0x9d342e53,0xa0a2f355,0x32058ae1,0x75a4f6eb
+	.long	0x390b83ec,0xaa4060ef,0x065e719f,0x51bd6e10
+	.long	0xf93e218a,0x3d96dd06,0xaedd3e05,0x464de6bd
+	.long	0xb591548d,0x0571c45d,0x6f0406d4,0xff605015
+	.long	0x241998fb,0x97d6bde9,0xcc894043,0x7767d99e
+	.long	0xbdb0e842,0x8807898b,0x38e7195b,0xdb79c8ee
+	.long	0x47a17c0a,0xe97c420f,0xc9f8841e,0x00000000
+	.long	0x83098086,0x48322bed,0xac1e1170,0x4e6c5a72
+	.long	0xfbfd0eff,0x560f8538,0x1e3daed5,0x27362d39
+	.long	0x640a0fd9,0x21685ca6,0xd19b5b54,0x3a24362e
+	.long	0xb10c0a67,0x0f9357e7,0xd2b4ee96,0x9e1b9b91
+	.long	0x4f80c0c5,0xa261dc20,0x695a774b,0x161c121a
+	.long	0x0ae293ba,0xe5c0a02a,0x433c22e0,0x1d121b17
+	.long	0x0b0e090d,0xadf28bc7,0xb92db6a8,0xc8141ea9
+	.long	0x8557f119,0x4caf7507,0xbbee99dd,0xfda37f60
+	.long	0x9ff70126,0xbc5c72f5,0xc544663b,0x345bfb7e
+	.long	0x768b4329,0xdccb23c6,0x68b6edfc,0x63b8e4f1
+	.long	0xcad731dc,0x10426385,0x40139722,0x2084c611
+	.long	0x7d854a24,0xf8d2bb3d,0x11aef932,0x6dc729a1
+	.long	0x4b1d9e2f,0xf3dcb230,0xec0d8652,0xd077c1e3
+	.long	0x6c2bb316,0x99a970b9,0xfa119448,0x2247e964
+	.long	0xc4a8fc8c,0x1aa0f03f,0xd8567d2c,0xef223390
+	.long	0xc787494e,0xc1d938d1,0xfe8ccaa2,0x3698d40b
+	.long	0xcfa6f581,0x28a57ade,0x26dab78e,0xa43fadbf
+	.long	0xe42c3a9d,0x0d507892,0x9b6a5fcc,0x62547e46
+	.long	0xc2f68d13,0xe890d8b8,0x5e2e39f7,0xf582c3af
+	.long	0xbe9f5d80,0x7c69d093,0xa96fd52d,0xb3cf2512
+	.long	0x3bc8ac99,0xa710187d,0x6ee89c63,0x7bdb3bbb
+	.long	0x09cd2678,0xf46e5918,0x01ec9ab7,0xa8834f9a
+	.long	0x65e6956e,0x7eaaffe6,0x0821bccf,0xe6ef15e8
+	.long	0xd9bae79b,0xce4a6f36,0xd4ea9f09,0xd629b07c
+	.long	0xaf31a4b2,0x312a3f23,0x30c6a594,0xc035a266
+	.long	0x37744ebc,0xa6fc82ca,0xb0e090d0,0x1533a7d8
+	.long	0x4af10498,0xf741ecda,0x0e7fcd50,0x2f1791f6
+	.long	0x8d764dd6,0x4d43efb0,0x54ccaa4d,0xdfe49604
+	.long	0xe39ed1b5,0x1b4c6a88,0xb8c12c1f,0x7f466551
+	.long	0x049d5eea,0x5d018c35,0x73fa8774,0x2efb0b41
+	.long	0x5ab3671d,0x5292dbd2,0x33e91056,0x136dd647
+	.long	0x8c9ad761,0x7a37a10c,0x8e59f814,0x89eb133c
+	.long	0xeecea927,0x35b761c9,0xede11ce5,0x3c7a47b1
+	.long	0x599cd2df,0x3f55f273,0x791814ce,0xbf73c737
+	.long	0xea53f7cd,0x5b5ffdaa,0x14df3d6f,0x867844db
+	.long	0x81caaff3,0x3eb968c4,0x2c382434,0x5fc2a340
+	.long	0x72161dc3,0x0cbce225,0x8b283c49,0x41ff0d95
+	.long	0x7139a801,0xde080cb3,0x9cd8b4e4,0x906456c1
+	.long	0x617bcb84,0x70d532b6,0x74486c5c,0x42d0b857
 #endif /* HAVE_AES_DECRYPT */
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_te, %object
 	.section	.rodata
+	.type	L_AES_ARM64_te, %object
 	.size	L_AES_ARM64_te, 1024
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_te:
-	.word	0xa5c66363
-	.word	0x84f87c7c
-	.word	0x99ee7777
-	.word	0x8df67b7b
-	.word	0x0dfff2f2
-	.word	0xbdd66b6b
-	.word	0xb1de6f6f
-	.word	0x5491c5c5
-	.word	0x50603030
-	.word	0x03020101
-	.word	0xa9ce6767
-	.word	0x7d562b2b
-	.word	0x19e7fefe
-	.word	0x62b5d7d7
-	.word	0xe64dabab
-	.word	0x9aec7676
-	.word	0x458fcaca
-	.word	0x9d1f8282
-	.word	0x4089c9c9
-	.word	0x87fa7d7d
-	.word	0x15effafa
-	.word	0xebb25959
-	.word	0xc98e4747
-	.word	0x0bfbf0f0
-	.word	0xec41adad
-	.word	0x67b3d4d4
-	.word	0xfd5fa2a2
-	.word	0xea45afaf
-	.word	0xbf239c9c
-	.word	0xf753a4a4
-	.word	0x96e47272
-	.word	0x5b9bc0c0
-	.word	0xc275b7b7
-	.word	0x1ce1fdfd
-	.word	0xae3d9393
-	.word	0x6a4c2626
-	.word	0x5a6c3636
-	.word	0x417e3f3f
-	.word	0x02f5f7f7
-	.word	0x4f83cccc
-	.word	0x5c683434
-	.word	0xf451a5a5
-	.word	0x34d1e5e5
-	.word	0x08f9f1f1
-	.word	0x93e27171
-	.word	0x73abd8d8
-	.word	0x53623131
-	.word	0x3f2a1515
-	.word	0x0c080404
-	.word	0x5295c7c7
-	.word	0x65462323
-	.word	0x5e9dc3c3
-	.word	0x28301818
-	.word	0xa1379696
-	.word	0x0f0a0505
-	.word	0xb52f9a9a
-	.word	0x090e0707
-	.word	0x36241212
-	.word	0x9b1b8080
-	.word	0x3ddfe2e2
-	.word	0x26cdebeb
-	.word	0x694e2727
-	.word	0xcd7fb2b2
-	.word	0x9fea7575
-	.word	0x1b120909
-	.word	0x9e1d8383
-	.word	0x74582c2c
-	.word	0x2e341a1a
-	.word	0x2d361b1b
-	.word	0xb2dc6e6e
-	.word	0xeeb45a5a
-	.word	0xfb5ba0a0
-	.word	0xf6a45252
-	.word	0x4d763b3b
-	.word	0x61b7d6d6
-	.word	0xce7db3b3
-	.word	0x7b522929
-	.word	0x3edde3e3
-	.word	0x715e2f2f
-	.word	0x97138484
-	.word	0xf5a65353
-	.word	0x68b9d1d1
-	.word	0x00000000
-	.word	0x2cc1eded
-	.word	0x60402020
-	.word	0x1fe3fcfc
-	.word	0xc879b1b1
-	.word	0xedb65b5b
-	.word	0xbed46a6a
-	.word	0x468dcbcb
-	.word	0xd967bebe
-	.word	0x4b723939
-	.word	0xde944a4a
-	.word	0xd4984c4c
-	.word	0xe8b05858
-	.word	0x4a85cfcf
-	.word	0x6bbbd0d0
-	.word	0x2ac5efef
-	.word	0xe54faaaa
-	.word	0x16edfbfb
-	.word	0xc5864343
-	.word	0xd79a4d4d
-	.word	0x55663333
-	.word	0x94118585
-	.word	0xcf8a4545
-	.word	0x10e9f9f9
-	.word	0x06040202
-	.word	0x81fe7f7f
-	.word	0xf0a05050
-	.word	0x44783c3c
-	.word	0xba259f9f
-	.word	0xe34ba8a8
-	.word	0xf3a25151
-	.word	0xfe5da3a3
-	.word	0xc0804040
-	.word	0x8a058f8f
-	.word	0xad3f9292
-	.word	0xbc219d9d
-	.word	0x48703838
-	.word	0x04f1f5f5
-	.word	0xdf63bcbc
-	.word	0xc177b6b6
-	.word	0x75afdada
-	.word	0x63422121
-	.word	0x30201010
-	.word	0x1ae5ffff
-	.word	0x0efdf3f3
-	.word	0x6dbfd2d2
-	.word	0x4c81cdcd
-	.word	0x14180c0c
-	.word	0x35261313
-	.word	0x2fc3ecec
-	.word	0xe1be5f5f
-	.word	0xa2359797
-	.word	0xcc884444
-	.word	0x392e1717
-	.word	0x5793c4c4
-	.word	0xf255a7a7
-	.word	0x82fc7e7e
-	.word	0x477a3d3d
-	.word	0xacc86464
-	.word	0xe7ba5d5d
-	.word	0x2b321919
-	.word	0x95e67373
-	.word	0xa0c06060
-	.word	0x98198181
-	.word	0xd19e4f4f
-	.word	0x7fa3dcdc
-	.word	0x66442222
-	.word	0x7e542a2a
-	.word	0xab3b9090
-	.word	0x830b8888
-	.word	0xca8c4646
-	.word	0x29c7eeee
-	.word	0xd36bb8b8
-	.word	0x3c281414
-	.word	0x79a7dede
-	.word	0xe2bc5e5e
-	.word	0x1d160b0b
-	.word	0x76addbdb
-	.word	0x3bdbe0e0
-	.word	0x56643232
-	.word	0x4e743a3a
-	.word	0x1e140a0a
-	.word	0xdb924949
-	.word	0x0a0c0606
-	.word	0x6c482424
-	.word	0xe4b85c5c
-	.word	0x5d9fc2c2
-	.word	0x6ebdd3d3
-	.word	0xef43acac
-	.word	0xa6c46262
-	.word	0xa8399191
-	.word	0xa4319595
-	.word	0x37d3e4e4
-	.word	0x8bf27979
-	.word	0x32d5e7e7
-	.word	0x438bc8c8
-	.word	0x596e3737
-	.word	0xb7da6d6d
-	.word	0x8c018d8d
-	.word	0x64b1d5d5
-	.word	0xd29c4e4e
-	.word	0xe049a9a9
-	.word	0xb4d86c6c
-	.word	0xfaac5656
-	.word	0x07f3f4f4
-	.word	0x25cfeaea
-	.word	0xafca6565
-	.word	0x8ef47a7a
-	.word	0xe947aeae
-	.word	0x18100808
-	.word	0xd56fbaba
-	.word	0x88f07878
-	.word	0x6f4a2525
-	.word	0x725c2e2e
-	.word	0x24381c1c
-	.word	0xf157a6a6
-	.word	0xc773b4b4
-	.word	0x5197c6c6
-	.word	0x23cbe8e8
-	.word	0x7ca1dddd
-	.word	0x9ce87474
-	.word	0x213e1f1f
-	.word	0xdd964b4b
-	.word	0xdc61bdbd
-	.word	0x860d8b8b
-	.word	0x850f8a8a
-	.word	0x90e07070
-	.word	0x427c3e3e
-	.word	0xc471b5b5
-	.word	0xaacc6666
-	.word	0xd8904848
-	.word	0x05060303
-	.word	0x01f7f6f6
-	.word	0x121c0e0e
-	.word	0xa3c26161
-	.word	0x5f6a3535
-	.word	0xf9ae5757
-	.word	0xd069b9b9
-	.word	0x91178686
-	.word	0x5899c1c1
-	.word	0x273a1d1d
-	.word	0xb9279e9e
-	.word	0x38d9e1e1
-	.word	0x13ebf8f8
-	.word	0xb32b9898
-	.word	0x33221111
-	.word	0xbbd26969
-	.word	0x70a9d9d9
-	.word	0x89078e8e
-	.word	0xa7339494
-	.word	0xb62d9b9b
-	.word	0x223c1e1e
-	.word	0x92158787
-	.word	0x20c9e9e9
-	.word	0x4987cece
-	.word	0xffaa5555
-	.word	0x78502828
-	.word	0x7aa5dfdf
-	.word	0x8f038c8c
-	.word	0xf859a1a1
-	.word	0x80098989
-	.word	0x171a0d0d
-	.word	0xda65bfbf
-	.word	0x31d7e6e6
-	.word	0xc6844242
-	.word	0xb8d06868
-	.word	0xc3824141
-	.word	0xb0299999
-	.word	0x775a2d2d
-	.word	0x111e0f0f
-	.word	0xcb7bb0b0
-	.word	0xfca85454
-	.word	0xd66dbbbb
-	.word	0x3a2c1616
+	.long	0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b
+	.long	0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5
+	.long	0x50603030,0x03020101,0xa9ce6767,0x7d562b2b
+	.long	0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676
+	.long	0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d
+	.long	0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0
+	.long	0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf
+	.long	0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0
+	.long	0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626
+	.long	0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc
+	.long	0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1
+	.long	0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515
+	.long	0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3
+	.long	0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a
+	.long	0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2
+	.long	0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575
+	.long	0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a
+	.long	0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0
+	.long	0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3
+	.long	0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484
+	.long	0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded
+	.long	0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b
+	.long	0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939
+	.long	0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf
+	.long	0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb
+	.long	0xc5864343,0xd79a4d4d,0x55663333,0x94118585
+	.long	0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f
+	.long	0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8
+	.long	0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f
+	.long	0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5
+	.long	0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121
+	.long	0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2
+	.long	0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec
+	.long	0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717
+	.long	0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d
+	.long	0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373
+	.long	0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc
+	.long	0x66442222,0x7e542a2a,0xab3b9090,0x830b8888
+	.long	0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414
+	.long	0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb
+	.long	0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a
+	.long	0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c
+	.long	0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262
+	.long	0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979
+	.long	0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d
+	.long	0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9
+	.long	0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea
+	.long	0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808
+	.long	0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e
+	.long	0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6
+	.long	0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f
+	.long	0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a
+	.long	0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666
+	.long	0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e
+	.long	0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9
+	.long	0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e
+	.long	0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111
+	.long	0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494
+	.long	0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9
+	.long	0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf
+	.long	0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d
+	.long	0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868
+	.long	0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f
+	.long	0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616
 #endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
         * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
 #ifdef HAVE_AES_DECRYPT
@@ -51969,28 +51585,22 @@ L_AES_invert_key_mix_loop:
 #endif /* HAVE_AES_DECRYPT */
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_rcon, %object
 	.section	.rodata
+	.type	L_AES_ARM64_rcon, %object
 	.size	L_AES_ARM64_rcon, 40
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_rcon:
-	.word	0x01000000
-	.word	0x02000000
-	.word	0x04000000
-	.word	0x08000000
-	.word	0x10000000
-	.word	0x20000000
-	.word	0x40000000
-	.word	0x80000000
-	.word	0x1b000000
-	.word	0x36000000
+	.long	0x01000000,0x02000000,0x04000000,0x08000000
+	.long	0x10000000,0x20000000,0x40000000,0x80000000
+	.long	0x1b000000,0x36000000
 #ifndef __APPLE__
 .text
 .globl	AES_set_encrypt_key
@@ -53270,16 +52880,17 @@ L_AES_CTR_encrypt_loop_nr:
     defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB)
 #ifndef __APPLE__
 	.text
-	.type	L_AES_ARM64_td4, %object
 	.section	.rodata
+	.type	L_AES_ARM64_td4, %object
 	.size	L_AES_ARM64_td4, 256
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_AES_ARM64_td4:
 	.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
@@ -54230,50 +53841,27 @@ L_AES_CBC_decrypt_end_dec:
 #ifdef HAVE_AESGCM
 #ifndef __APPLE__
 	.text
-	.type	L_GCM_gmult_len_r, %object
 	.section	.rodata
+	.type	L_GCM_gmult_len_r, %object
 	.size	L_GCM_gmult_len_r, 128
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_GCM_gmult_len_r:
-	.word	0x00000000
-	.word	0x1c200000
-	.word	0x38400000
-	.word	0x24600000
-	.word	0x70800000
-	.word	0x6ca00000
-	.word	0x48c00000
-	.word	0x54e00000
-	.word	0xe1000000
-	.word	0xfd200000
-	.word	0xd9400000
-	.word	0xc5600000
-	.word	0x91800000
-	.word	0x8da00000
-	.word	0xa9c00000
-	.word	0xb5e00000
-	.word	0x00000000
-	.word	0x01c20000
-	.word	0x03840000
-	.word	0x02460000
-	.word	0x07080000
-	.word	0x06ca0000
-	.word	0x048c0000
-	.word	0x054e0000
-	.word	0x0e100000
-	.word	0x0fd20000
-	.word	0x0d940000
-	.word	0x0c560000
-	.word	0x09180000
-	.word	0x08da0000
-	.word	0x0a9c0000
-	.word	0x0b5e0000
+	.long	0x00000000,0x1c200000,0x38400000,0x24600000
+	.long	0x70800000,0x6ca00000,0x48c00000,0x54e00000
+	.long	0xe1000000,0xfd200000,0xd9400000,0xc5600000
+	.long	0x91800000,0x8da00000,0xa9c00000,0xb5e00000
+	.long	0x00000000,0x01c20000,0x03840000,0x02460000
+	.long	0x07080000,0x06ca0000,0x048c0000,0x054e0000
+	.long	0x0e100000,0x0fd20000,0x0d940000,0x0c560000
+	.long	0x09180000,0x08da0000,0x0a9c0000,0x0b5e0000
 #ifndef __APPLE__
 .text
 .globl	GCM_gmult_len
diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c
index 26f11a70b71..7c6e43e9729 100644
--- a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c
@@ -207,7 +207,7 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
         "stur	q0, [%x[key], #96]\n\t"
         "b	L_aes_set_key_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_set_key_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_set_key_arm64_crypto_start_256_%=:\n\t"
         "ldr	x4, [%x[userKey]], #8\n\t"
         "ldr	x6, [%x[userKey]], #8\n\t"
         "ldr	x8, [%x[userKey]], #8\n\t"
@@ -410,7 +410,7 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
         "stur	q0, [%x[key], #112]\n\t"
         "b	L_aes_set_key_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_set_key_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_set_key_arm64_crypto_start_128_%=:\n\t"
         "ldr	x4, [%x[userKey]], #8\n\t"
         "ldr	x6, [%x[userKey]], #8\n\t"
         "stp	x4, x6, [%x[key]], #16\n\t"
@@ -573,7 +573,7 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
         "aesimc	v0.16b, v0.16b\n\t"
         "stur	q0, [%x[key], #80]\n\t"
         "\n"
-    "L_aes_set_key_arm64_crypto_done_%=: \n\t"
+    "L_aes_set_key_arm64_crypto_done_%=:\n\t"
         : [keylen] "+r" (keylen), [key] "+r" (key), [dir] "+r" (dir)
         : [userKey] "r" (userKey)
         : "memory", "cc", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
@@ -581,7 +581,9 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
     );
 }
 
-#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC)
+#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
+        defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(HAVE_AES_CBC)
 void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr)
 {
     __asm__ __volatile__ (
@@ -623,7 +625,7 @@ void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr)
         "aesmc	v0.16b, v0.16b\n\t"
         "aese	v0.16b, v2.16b\n\t"
         "\n"
-    "L_aes_encrypt_arm64_crypto_round_done_%=: \n\t"
+    "L_aes_encrypt_arm64_crypto_round_done_%=:\n\t"
         "ld1	{v1.2d}, [%x[key]]\n\t"
         "eor	v0.16b, v0.16b, v1.16b\n\t"
         "st1	{v0.16b}, [%x[outBlock]]\n\t"
@@ -633,8 +635,11 @@ void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr)
     );
 }
 
-#endif /* defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) */
-#if !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+#endif /* defined(HAVE_AESCCM) || defined(HAVE_AESGCM) ||
+        * defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) ||
+        * defined(HAVE_AES_CBC) */
+#if !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) || \
+        defined(WOLFSSL_AES_COUNTER)
 #ifdef HAVE_AES_DECRYPT
 void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr)
 {
@@ -677,7 +682,7 @@ void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr)
         "aesimc	v0.16b, v0.16b\n\t"
         "aesd	v0.16b, v2.16b\n\t"
         "\n"
-    "L_aes_decrypt_arm64_crypto_round_done_%=: \n\t"
+    "L_aes_decrypt_arm64_crypto_round_done_%=:\n\t"
         "ld1	{v1.2d}, [%x[key]]\n\t"
         "eor	v0.16b, v0.16b, v1.16b\n\t"
         "st1	{v0.16b}, [%x[outBlock]]\n\t"
@@ -688,7 +693,8 @@ void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr)
 }
 
 #endif /* HAVE_AES_DECRYPT */
-#endif /* !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) */
+#endif /* !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) ||
+        * defined(WOLFSSL_AES_COUNTER) */
 #ifdef HAVE_AES_ECB
 void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
     int nr)
@@ -709,7 +715,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_192_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "aese	v0.16b, v16.16b\n\t"
@@ -910,7 +916,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.ge	L_aes_encrypt_blocks_arm64_crypto_192_start_8_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_192_start_4_%=:\n\t"
         "cmp	%w[sz], #4\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_192_start_2_%=\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
@@ -1013,7 +1019,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #4\n\t"
         "st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_192_start_2_%=:\n\t"
         "cmp	%w[sz], #2\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_192_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -1068,7 +1074,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_192_start_1_%=:\n\t"
         "cbz	%w[sz], L_aes_encrypt_blocks_arm64_crypto_192_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "aese	v0.16b, v16.16b\n\t"
@@ -1097,12 +1103,12 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "eor	v0.16b, v0.16b, v28.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_encrypt_blocks_arm64_crypto_done_%=\n\t"
         /* AES_ECB_256 */
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v27.2d, v28.2d, v29.2d, v30.2d}, [%x[key]], #0x40\n\t"
         "cmp	%w[sz], #1\n\t"
@@ -1110,7 +1116,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_256_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "aese	v0.16b, v16.16b\n\t"
@@ -1343,7 +1349,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.ge	L_aes_encrypt_blocks_arm64_crypto_256_start_8_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_256_start_4_%=:\n\t"
         "cmp	%w[sz], #4\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_256_start_2_%=\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
@@ -1462,7 +1468,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #4\n\t"
         "st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_256_start_2_%=:\n\t"
         "cmp	%w[sz], #2\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_256_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -1525,7 +1531,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_256_start_1_%=:\n\t"
         "cbz	%w[sz], L_aes_encrypt_blocks_arm64_crypto_256_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "aese	v0.16b, v16.16b\n\t"
@@ -1558,19 +1564,19 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "eor	v0.16b, v0.16b, v30.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_encrypt_blocks_arm64_crypto_done_%=\n\t"
         /* AES_ECB_128 */
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	%w[sz], #1\n\t"
         "b.eq	L_aes_encrypt_blocks_arm64_crypto_128_start_1_%=\n\t"
         "cmp	%w[sz], #8\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_128_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "aese	v0.16b, v16.16b\n\t"
@@ -1739,7 +1745,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.ge	L_aes_encrypt_blocks_arm64_crypto_128_start_8_%=\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_128_start_4_%=:\n\t"
         "cmp	%w[sz], #4\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_128_start_2_%=\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
@@ -1826,7 +1832,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #4\n\t"
         "st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_128_start_2_%=:\n\t"
         "cmp	%w[sz], #2\n\t"
         "b.lt	L_aes_encrypt_blocks_arm64_crypto_128_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -1873,7 +1879,7 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_128_start_1_%=:\n\t"
         "cbz	%w[sz], L_aes_encrypt_blocks_arm64_crypto_128_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "aese	v0.16b, v16.16b\n\t"
@@ -1898,10 +1904,10 @@ void AES_encrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "eor	v0.16b, v0.16b, v26.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_encrypt_blocks_arm64_crypto_done_%=: \n\t"
+    "L_aes_encrypt_blocks_arm64_crypto_done_%=:\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), [nr] "+r" (nr)
         : [in] "r" (in)
         : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
@@ -1930,7 +1936,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_192_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "aesd	v0.16b, v16.16b\n\t"
@@ -2131,7 +2137,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.ge	L_aes_decrypt_blocks_arm64_crypto_192_start_8_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_192_start_4_%=:\n\t"
         "cmp	%w[sz], #4\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_192_start_2_%=\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
@@ -2234,7 +2240,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #4\n\t"
         "st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_192_start_2_%=:\n\t"
         "cmp	%w[sz], #2\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_192_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -2289,7 +2295,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_192_start_1_%=:\n\t"
         "cbz	%w[sz], L_aes_decrypt_blocks_arm64_crypto_192_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "aesd	v0.16b, v16.16b\n\t"
@@ -2318,12 +2324,12 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "eor	v0.16b, v0.16b, v28.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_decrypt_blocks_arm64_crypto_done_%=\n\t"
         /* AES_ECB_256 */
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v27.2d, v28.2d, v29.2d, v30.2d}, [%x[key]], #0x40\n\t"
         "cmp	%w[sz], #1\n\t"
@@ -2331,7 +2337,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_256_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "aesd	v0.16b, v16.16b\n\t"
@@ -2564,7 +2570,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.ge	L_aes_decrypt_blocks_arm64_crypto_256_start_8_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_256_start_4_%=:\n\t"
         "cmp	%w[sz], #4\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_256_start_2_%=\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
@@ -2683,7 +2689,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #4\n\t"
         "st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_256_start_2_%=:\n\t"
         "cmp	%w[sz], #2\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_256_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -2746,7 +2752,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_256_start_1_%=:\n\t"
         "cbz	%w[sz], L_aes_decrypt_blocks_arm64_crypto_256_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "aesd	v0.16b, v16.16b\n\t"
@@ -2779,19 +2785,19 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "eor	v0.16b, v0.16b, v30.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_decrypt_blocks_arm64_crypto_done_%=\n\t"
         /* AES_ECB_128 */
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	%w[sz], #1\n\t"
         "b.eq	L_aes_decrypt_blocks_arm64_crypto_128_start_1_%=\n\t"
         "cmp	%w[sz], #8\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_128_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "aesd	v0.16b, v16.16b\n\t"
@@ -2960,7 +2966,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "cmp	%w[sz], #8\n\t"
         "b.ge	L_aes_decrypt_blocks_arm64_crypto_128_start_8_%=\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_128_start_4_%=:\n\t"
         "cmp	%w[sz], #4\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_128_start_2_%=\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
@@ -3047,7 +3053,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #4\n\t"
         "st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_128_start_2_%=:\n\t"
         "cmp	%w[sz], #2\n\t"
         "b.lt	L_aes_decrypt_blocks_arm64_crypto_128_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -3094,7 +3100,7 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "sub	%w[sz], %w[sz], #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_128_start_1_%=:\n\t"
         "cbz	%w[sz], L_aes_decrypt_blocks_arm64_crypto_128_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "aesd	v0.16b, v16.16b\n\t"
@@ -3119,10 +3125,10 @@ void AES_decrypt_blocks_AARCH64(const byte* in, byte* out, word32 sz, byte* key,
         "eor	v0.16b, v0.16b, v26.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_decrypt_blocks_arm64_crypto_done_%=: \n\t"
+    "L_aes_decrypt_blocks_arm64_crypto_done_%=:\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), [nr] "+r" (nr)
         : [in] "r" (in)
         : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
@@ -3149,7 +3155,7 @@ void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
 #ifndef NO_AES_192
         "ld1	{v24.2d, v25.2d, v26.2d, v27.2d}, [%x[key]], #0x40\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm64_crypto_loop_192_%=: \n\t"
+    "L_aes_cbc_encrypt_arm64_crypto_loop_192_%=:\n\t"
         "ld1	{v28.2d}, [%x[key]]\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "subs	%w[sz], %w[sz], #1\n\t"
@@ -3184,12 +3190,12 @@ void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b	L_aes_cbc_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_CBC_256 */
         "\n"
-    "L_aes_cbc_encrypt_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_cbc_encrypt_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v24.2d, v25.2d, v26.2d, v27.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v28.2d, v29.2d}, [%x[key]], #32\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm64_crypto_loop_256_%=: \n\t"
+    "L_aes_cbc_encrypt_arm64_crypto_loop_256_%=:\n\t"
         "ld1	{v30.2d}, [%x[key]]\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "subs	%w[sz], %w[sz], #1\n\t"
@@ -3228,11 +3234,11 @@ void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b	L_aes_cbc_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_CBC_128 */
         "\n"
-    "L_aes_cbc_encrypt_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_cbc_encrypt_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "ld1	{v24.2d, v25.2d}, [%x[key]], #32\n\t"
         "\n"
-    "L_aes_cbc_encrypt_arm64_crypto_loop_128_%=: \n\t"
+    "L_aes_cbc_encrypt_arm64_crypto_loop_128_%=:\n\t"
         "ld1	{v26.2d}, [%x[key]]\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "subs	%w[sz], %w[sz], #1\n\t"
@@ -3261,7 +3267,7 @@ void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.ne	L_aes_cbc_encrypt_arm64_crypto_loop_128_%=\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_cbc_encrypt_arm64_crypto_done_%=: \n\t"
+    "L_aes_cbc_encrypt_arm64_crypto_done_%=:\n\t"
         "st1	{v0.2d}, [%x[reg]]\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [reg] "+r" (reg), [key] "+r" (key),
           [nr] "+r" (nr)
@@ -3290,7 +3296,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "cmp	%w[sz], #10\n\t"
         "b.le	L_aes_cbc_decrypt_blocks_arm64_crypto_192_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_192_start_1_long_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_192_start_1_long_%=:\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "sub	%w[sz], %w[sz], #1\n\t"
         "mov	v2.16b, v1.16b\n\t"
@@ -3325,7 +3331,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.ge	L_aes_cbc_decrypt_blocks_arm64_crypto_192_start_1_long_%=\n\t"
         "b	L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_192_start_1_%=:\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "sub	%w[sz], %w[sz], #1\n\t"
         "eor	v2.16b, v0.16b, v28.16b\n\t"
@@ -3361,7 +3367,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b	L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=\n\t"
         /* AES_CBC_256 */
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v24.2d, v25.2d, v26.2d, v27.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v28.2d, v29.2d}, [%x[key]], #32\n\t"
@@ -3369,7 +3375,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "cmp	%w[sz], #5\n\t"
         "b.le	L_aes_cbc_decrypt_blocks_arm64_crypto_256_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_256_start_1_long_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_256_start_1_long_%=:\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "sub	%w[sz], %w[sz], #1\n\t"
         "mov	v2.16b, v1.16b\n\t"
@@ -3408,7 +3414,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.ge	L_aes_cbc_decrypt_blocks_arm64_crypto_256_start_1_long_%=\n\t"
         "b	L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_256_start_1_%=:\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "sub	%w[sz], %w[sz], #1\n\t"
         "eor	v2.16b, v0.16b, v30.16b\n\t"
@@ -3448,14 +3454,14 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b	L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=\n\t"
         /* AES_CBC_128 */
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "ld1	{v24.2d, v25.2d}, [%x[key]], #32\n\t"
         "ld1	{v26.2d}, [%x[key]]\n\t"
         "cmp	%w[sz], #24\n\t"
         "b.le	L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_long_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_long_%=:\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "sub	%w[sz], %w[sz], #1\n\t"
         "mov	v2.16b, v1.16b\n\t"
@@ -3486,7 +3492,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.ge	L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_long_%=\n\t"
         "b	L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_%=:\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "sub	%w[sz], %w[sz], #1\n\t"
         "eor	v2.16b, v0.16b, v26.16b\n\t"
@@ -3516,7 +3522,7 @@ void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.ge	L_aes_cbc_decrypt_blocks_arm64_crypto_128_start_1_%=\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=: \n\t"
+    "L_aes_cbc_decrypt_blocks_arm64_crypto_done_%=:\n\t"
         "st1	{v0.2d}, [%x[reg]]\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [reg] "+r" (reg), [key] "+r" (key),
           [nr] "+r" (nr)
@@ -3562,7 +3568,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "adds	x15, x9, #3\n\t"
         "adc	x16, x10, xzr\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_start_8_%=:\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v28.16b, v29.16b, v30.16b, v31.16b}, [%x[in]], #0x40\n\t"
         "mov	v17.d[0], x12\n\t"
@@ -3811,7 +3817,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_ctr_encrypt_arm64_crypto_192_start_8_%=\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_start_4_%=:\n\t"
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_ctr_encrypt_arm64_crypto_192_start_2_%=\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[in]], #0x40\n\t"
@@ -3938,7 +3944,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_ctr_encrypt_arm64_crypto_192_start_1_%=\n\t"
         "ld1	{v24.16b, v25.16b}, [%x[in]], #32\n\t"
@@ -4010,7 +4016,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_start_1_%=:\n\t"
         "cbz	w8, L_aes_ctr_encrypt_arm64_crypto_192_done_%=\n\t"
         "ld1	{v24.16b}, [%x[in]], #16\n\t"
         "rev64	v16.16b, v16.16b\n\t"
@@ -4043,7 +4049,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "adc	x10, x10, xzr\n\t"
         "st1	{v24.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_done_%=:\n\t"
         "cbz	%w[sz], L_aes_ctr_encrypt_arm64_crypto_192_partial_done_%=\n\t"
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
@@ -4078,7 +4084,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	w13, #16\n\t"
         "sub	w13, w13, %w[sz]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_start_byte_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_start_byte_%=:\n\t"
         "ldrb	w11, [%x[tmp]], #1\n\t"
         "ldrb	w12, [%x[in]], #1\n\t"
         "eor	w11, w11, w12\n\t"
@@ -4087,12 +4093,12 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.gt	L_aes_ctr_encrypt_arm64_crypto_192_start_byte_%=\n\t"
         "str	w13, [%x[left]]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_192_partial_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_192_partial_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_ctr_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_CTR_256 */
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v12.2d, v13.2d}, [%x[key]], #32\n\t"
@@ -4108,7 +4114,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "adds	x15, x9, #3\n\t"
         "adc	x16, x10, xzr\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_start_8_%=:\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v28.16b, v29.16b, v30.16b, v31.16b}, [%x[in]], #0x40\n\t"
         "mov	v17.d[0], x12\n\t"
@@ -4389,7 +4395,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_ctr_encrypt_arm64_crypto_256_start_8_%=\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_start_4_%=:\n\t"
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_ctr_encrypt_arm64_crypto_256_start_2_%=\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[in]], #0x40\n\t"
@@ -4532,7 +4538,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_ctr_encrypt_arm64_crypto_256_start_1_%=\n\t"
         "ld1	{v24.16b, v25.16b}, [%x[in]], #32\n\t"
@@ -4612,7 +4618,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_start_1_%=:\n\t"
         "cbz	w8, L_aes_ctr_encrypt_arm64_crypto_256_done_%=\n\t"
         "ld1	{v24.16b}, [%x[in]], #16\n\t"
         "rev64	v16.16b, v16.16b\n\t"
@@ -4649,7 +4655,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "adc	x10, x10, xzr\n\t"
         "st1	{v24.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_done_%=:\n\t"
         "cbz	%w[sz], L_aes_ctr_encrypt_arm64_crypto_256_partial_done_%=\n\t"
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
@@ -4688,7 +4694,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	w13, #16\n\t"
         "sub	w13, w13, %w[sz]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_start_byte_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_start_byte_%=:\n\t"
         "ldrb	w11, [%x[tmp]], #1\n\t"
         "ldrb	w12, [%x[in]], #1\n\t"
         "eor	w11, w11, w12\n\t"
@@ -4697,12 +4703,12 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.gt	L_aes_ctr_encrypt_arm64_crypto_256_start_byte_%=\n\t"
         "str	w13, [%x[left]]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_256_partial_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_256_partial_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_ctr_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_CTR_128 */
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "ld1	{v8.2d, v9.2d}, [%x[key]], #32\n\t"
         "ld1	{v10.2d}, [%x[key]]\n\t"
@@ -4717,7 +4723,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "adds	x15, x9, #3\n\t"
         "adc	x16, x10, xzr\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_start_8_%=:\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v28.16b, v29.16b, v30.16b, v31.16b}, [%x[in]], #0x40\n\t"
         "mov	v17.d[0], x12\n\t"
@@ -4934,7 +4940,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_ctr_encrypt_arm64_crypto_128_start_8_%=\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_start_4_%=:\n\t"
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_ctr_encrypt_arm64_crypto_128_start_2_%=\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[in]], #0x40\n\t"
@@ -5045,7 +5051,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_ctr_encrypt_arm64_crypto_128_start_1_%=\n\t"
         "ld1	{v24.16b, v25.16b}, [%x[in]], #32\n\t"
@@ -5109,7 +5115,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_start_1_%=:\n\t"
         "cbz	w8, L_aes_ctr_encrypt_arm64_crypto_128_done_%=\n\t"
         "ld1	{v24.16b}, [%x[in]], #16\n\t"
         "rev64	v16.16b, v16.16b\n\t"
@@ -5138,7 +5144,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "adc	x10, x10, xzr\n\t"
         "st1	{v24.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_done_%=:\n\t"
         "cbz	%w[sz], L_aes_ctr_encrypt_arm64_crypto_128_partial_done_%=\n\t"
         "mov	v16.d[0], x10\n\t"
         "mov	v16.d[1], x9\n\t"
@@ -5169,7 +5175,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "mov	w13, #16\n\t"
         "sub	w13, w13, %w[sz]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_start_byte_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_start_byte_%=:\n\t"
         "ldrb	w11, [%x[tmp]], #1\n\t"
         "ldrb	w12, [%x[in]], #1\n\t"
         "eor	w11, w11, w12\n\t"
@@ -5178,10 +5184,10 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg,
         "b.gt	L_aes_ctr_encrypt_arm64_crypto_128_start_byte_%=\n\t"
         "str	w13, [%x[left]]\n\t"
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_128_partial_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_128_partial_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_ctr_encrypt_arm64_crypto_done_%=: \n\t"
+    "L_aes_ctr_encrypt_arm64_crypto_done_%=:\n\t"
         "rev	x11, x10\n\t"
         "rev	x12, x9\n\t"
         "stp	x11, x12, [%x[reg]]\n\t"
@@ -5242,7 +5248,7 @@ void AES_GCM_set_key_AARCH64(const byte* nonce, const byte* key, byte* gcm_h,
         "aesmc	v0.16b, v0.16b\n\t"
         "aese	v0.16b, v2.16b\n\t"
         "\n"
-    "L_aes_gcm_set_key_arm64_crypto_round_done_%=: \n\t"
+    "L_aes_gcm_set_key_arm64_crypto_round_done_%=:\n\t"
         "ld1	{v1.2d}, [%x[key]]\n\t"
         "eor	v0.16b, v0.16b, v1.16b\n\t"
         "rbit	v0.16b, v0.16b\n\t"
@@ -5372,7 +5378,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_h_done_%=:\n\t"
         "lsr	w14, w8, #4\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_start_1_%=\n\t"
@@ -5381,7 +5387,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_8_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -5488,7 +5494,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #16\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_4_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -5548,7 +5554,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_done_%=\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_aad_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_2_%=:\n\t"
         "ld1	{v18.16b, v19.16b}, [%x[aad]], #32\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -5584,10 +5590,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.gt	L_aes_gcm_encrypt_arm64_crypto_aad_start_2_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_1_%=:\n\t"
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_both_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_both_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[aad]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v26.16b, v18.16b\n\t"
@@ -5610,7 +5616,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_aad_both_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_done_%=:\n\t"
         "and	w14, w8, #15\n\t"
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_aad_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -5622,28 +5628,28 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_start_sw_%=\n\t"
         "ldr	w19, [%x[aad]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=\n\t"
         "ldrh	w19, [%x[aad]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_encrypt_arm64_crypto_aad_end_bytes_%=\n\t"
         "ldrb	w19, [%x[aad]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -5665,7 +5671,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_aad_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_aad_partial_done_%=:\n\t"
         /* Load Nonce */
         "cmp	%w[nonceSz], #12\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_ghash_nonce_%=\n\t"
@@ -5677,12 +5683,12 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	w15, #1\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_ghash_nonce_%=:\n\t"
         "eor	v13.16b, v13.16b, v13.16b\n\t"
         "lsr	w14, %w[nonceSz], #4\n\t"
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_nonce_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[nonce]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v13.16b, v18.16b\n\t"
@@ -5705,7 +5711,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_nonce_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_done_%=:\n\t"
         "and	w24, %w[nonceSz], #15\n\t"
         "cbz	x24, L_aes_gcm_encrypt_arm64_crypto_nonce_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -5717,28 +5723,28 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_nonce_start_sw_%=\n\t"
         "ldr	w19, [%x[nonce]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=\n\t"
         "ldrh	w19, [%x[nonce]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_encrypt_arm64_crypto_nonce_end_bytes_%=\n\t"
         "ldrb	w19, [%x[nonce]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_end_bytes_%=:\n\t"
         "sub	x11, x11, x24\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -5760,7 +5766,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v13.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_nonce_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_nonce_partial_done_%=:\n\t"
         "eor	x14, x14, x14\n\t"
         "lsl	x24, %x[nonceSz], #3\n\t"
         "mov	v28.d[0], x14\n\t"
@@ -5785,7 +5791,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	w15, v13.s[3]\n\t"
         "rev	w15, w15\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_done_nonce_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_done_nonce_%=:\n\t"
         "st1	{v13.2d}, [x12]\n\t"
         "lsr	w14, %w[sz], #4\n\t"
         "cmp	w13, #12\n\t"
@@ -5796,7 +5802,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -6057,7 +6063,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -6413,7 +6419,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -6510,7 +6516,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -6645,7 +6651,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -6819,7 +6825,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -6875,7 +6881,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -6969,7 +6975,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -7020,7 +7026,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_192_partial_done_%=\n\t"
         "eor	v16.16b, v16.16b, v16.16b\n\t"
@@ -7032,28 +7038,28 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_192_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v16.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -7093,32 +7099,32 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_out_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_out_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_192_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_out_end_bytes_%=:\n\t"
         "mov	x17, #16\n\t"
         "sub	x17, x17, x14\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_start_zero_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_start_zero_%=:\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	wzr, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_192_start_zero_%=\n\t"
@@ -7143,7 +7149,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -7197,7 +7203,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v26.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_tag_partial_%=:\n\t"
         "st1	{v26.16b}, [x11]\n\t"
         "cmp	%w[tagSz], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_tag_start_dw_%=\n\t"
@@ -7205,38 +7211,38 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	%w[tagSz], %w[tagSz], #8\n\t"
         "str	x16, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_dw_%=:\n\t"
         "cmp	%w[tagSz], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_tag_start_sw_%=\n\t"
         "ldr	w16, [x11], #4\n\t"
         "sub	%w[tagSz], %w[tagSz], #4\n\t"
         "str	w16, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_sw_%=:\n\t"
         "cmp	%w[tagSz], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=\n\t"
         "ldrh	w16, [x11], #2\n\t"
         "sub	%w[tagSz], %w[tagSz], #2\n\t"
         "strh	w16, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=:\n\t"
         "cbz	%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_192_tag_end_bytes_%=\n\t"
         "ldrb	w16, [x11], #1\n\t"
         "subs	%w[tagSz], %w[tagSz], #1\n\t"
         "strb	w16, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_192_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_192_tag_end_bytes_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -7531,7 +7537,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -7921,7 +7927,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -8018,7 +8024,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -8170,7 +8176,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -8361,7 +8367,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -8417,7 +8423,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -8520,7 +8526,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -8577,7 +8583,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_256_partial_done_%=\n\t"
         "eor	v16.16b, v16.16b, v16.16b\n\t"
@@ -8589,28 +8595,28 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_256_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v16.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -8656,32 +8662,32 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_out_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_out_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_256_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_out_end_bytes_%=:\n\t"
         "mov	x17, #16\n\t"
         "sub	x17, x17, x14\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_start_zero_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_start_zero_%=:\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	wzr, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_256_start_zero_%=\n\t"
@@ -8706,7 +8712,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -8768,7 +8774,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v26.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_tag_partial_%=:\n\t"
         "st1	{v26.16b}, [x11]\n\t"
         "cmp	%w[tagSz], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_tag_start_dw_%=\n\t"
@@ -8776,38 +8782,38 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	%w[tagSz], %w[tagSz], #8\n\t"
         "str	x16, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_dw_%=:\n\t"
         "cmp	%w[tagSz], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_tag_start_sw_%=\n\t"
         "ldr	w16, [x11], #4\n\t"
         "sub	%w[tagSz], %w[tagSz], #4\n\t"
         "str	w16, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_sw_%=:\n\t"
         "cmp	%w[tagSz], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=\n\t"
         "ldrh	w16, [x11], #2\n\t"
         "sub	%w[tagSz], %w[tagSz], #2\n\t"
         "strh	w16, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=:\n\t"
         "cbz	%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_256_tag_end_bytes_%=\n\t"
         "ldrb	w16, [x11], #1\n\t"
         "subs	%w[tagSz], %w[tagSz], #1\n\t"
         "strb	w16, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_256_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_256_tag_end_bytes_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -9034,7 +9040,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -9356,7 +9362,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -9453,7 +9459,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [x9], #32\n\t"
@@ -9572,7 +9578,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -9730,7 +9736,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -9786,7 +9792,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -9872,7 +9878,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -9919,7 +9925,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_128_partial_done_%=\n\t"
         "eor	v16.16b, v16.16b, v16.16b\n\t"
@@ -9931,28 +9937,28 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_128_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v16.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -9988,32 +9994,32 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_out_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_out_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_128_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_out_end_bytes_%=:\n\t"
         "mov	x17, #16\n\t"
         "sub	x17, x17, x14\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_start_zero_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_start_zero_%=:\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	wzr, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_128_start_zero_%=\n\t"
@@ -10038,7 +10044,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -10088,7 +10094,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v26.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_tag_partial_%=:\n\t"
         "st1	{v26.16b}, [x11]\n\t"
         "cmp	%w[tagSz], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_tag_start_dw_%=\n\t"
@@ -10096,31 +10102,31 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	%w[tagSz], %w[tagSz], #8\n\t"
         "str	x16, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_dw_%=:\n\t"
         "cmp	%w[tagSz], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_tag_start_sw_%=\n\t"
         "ldr	w16, [x11], #4\n\t"
         "sub	%w[tagSz], %w[tagSz], #4\n\t"
         "str	w16, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_sw_%=:\n\t"
         "cmp	%w[tagSz], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=\n\t"
         "ldrh	w16, [x11], #2\n\t"
         "sub	%w[tagSz], %w[tagSz], #2\n\t"
         "strh	w16, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=:\n\t"
         "cbz	%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_128_tag_end_bytes_%=\n\t"
         "ldrb	w16, [x11], #1\n\t"
         "subs	%w[tagSz], %w[tagSz], #1\n\t"
         "strb	w16, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_128_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_128_tag_end_bytes_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_done_%=:\n\t"
         "ldp	x29, x30, [sp], #0x50\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz),
           [tag] "+r" (tag), [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz),
@@ -10256,7 +10262,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_h_done_%=:\n\t"
         "lsr	w14, w8, #4\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_start_1_%=\n\t"
@@ -10265,7 +10271,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_8_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -10372,7 +10378,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #16\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_4_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -10432,7 +10438,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_done_%=\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_aad_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_2_%=:\n\t"
         "ld1	{v18.16b, v19.16b}, [%x[aad]], #32\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -10468,10 +10474,10 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.gt	L_aes_gcm_decrypt_arm64_crypto_aad_start_2_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_1_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_both_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_both_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[aad]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v26.16b, v18.16b\n\t"
@@ -10494,7 +10500,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_aad_both_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_done_%=:\n\t"
         "and	w14, w8, #15\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_aad_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -10506,28 +10512,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_start_sw_%=\n\t"
         "ldr	w19, [%x[aad]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=\n\t"
         "ldrh	w19, [%x[aad]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_decrypt_arm64_crypto_aad_end_bytes_%=\n\t"
         "ldrb	w19, [%x[aad]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -10549,7 +10555,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_aad_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_aad_partial_done_%=:\n\t"
         /* Load Nonce */
         "cmp	%w[nonceSz], #12\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_ghash_nonce_%=\n\t"
@@ -10561,12 +10567,12 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	w15, #1\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_ghash_nonce_%=:\n\t"
         "eor	v13.16b, v13.16b, v13.16b\n\t"
         "lsr	w14, %w[nonceSz], #4\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_nonce_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[nonce]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v13.16b, v18.16b\n\t"
@@ -10589,7 +10595,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_nonce_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_done_%=:\n\t"
         "and	w24, %w[nonceSz], #15\n\t"
         "cbz	x24, L_aes_gcm_decrypt_arm64_crypto_nonce_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -10601,28 +10607,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_nonce_start_sw_%=\n\t"
         "ldr	w19, [%x[nonce]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=\n\t"
         "ldrh	w19, [%x[nonce]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_decrypt_arm64_crypto_nonce_end_bytes_%=\n\t"
         "ldrb	w19, [%x[nonce]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_end_bytes_%=:\n\t"
         "sub	x11, x11, x24\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -10644,7 +10650,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v13.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_nonce_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_nonce_partial_done_%=:\n\t"
         "eor	x14, x14, x14\n\t"
         "lsl	x24, %x[nonceSz], #3\n\t"
         "mov	v28.d[0], x14\n\t"
@@ -10669,7 +10675,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	w15, v13.s[3]\n\t"
         "rev	w15, w15\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_done_nonce_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_done_nonce_%=:\n\t"
         "st1	{v13.2d}, [x12]\n\t"
         "lsr	w14, %w[sz], #4\n\t"
         "cmp	w13, #12\n\t"
@@ -10680,7 +10686,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -10941,7 +10947,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -11297,7 +11303,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -11394,7 +11400,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -11529,7 +11535,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -11703,7 +11709,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -11759,7 +11765,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -11853,7 +11859,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_1_%=:\n\t"
         "ld1	{v15.16b}, [%x[in]], #16\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -11905,7 +11911,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v14.16b, v14.16b, v15.16b\n\t"
         "st1	{v14.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_192_partial_done_%=\n\t"
         "eor	v15.16b, v15.16b, v15.16b\n\t"
@@ -11917,28 +11923,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_decrypt_arm64_crypto_192_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v15.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -11996,30 +12002,30 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w14, w14, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_out_start_dw_%=:\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	w14, w14, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_out_start_sw_%=:\n\t"
         "cmp	w14, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	w14, w14, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_192_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	w14, w14, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -12073,7 +12079,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "ld1	{v28.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_192_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_part_tag_%=:\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
         "mov	x17, %x[tagSz]\n\t"
         "st1	{v28.2d}, [x11]\n\t"
@@ -12083,28 +12089,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, #8\n\t"
         "str	x16, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_dw_%=:\n\t"
         "cmp	x17, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_tag_start_sw_%=\n\t"
         "ldr	w16, [%x[tag]], #4\n\t"
         "sub	x17, x17, #4\n\t"
         "str	w16, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_sw_%=:\n\t"
         "cmp	x17, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=\n\t"
         "ldrh	w16, [%x[tag]], #2\n\t"
         "sub	x17, x17, #2\n\t"
         "strh	w16, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=:\n\t"
         "cbz	x17, L_aes_gcm_decrypt_arm64_crypto_192_tag_end_bytes_%=\n\t"
         "ldrb	w16, [%x[tag]], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	w16, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_tag_end_bytes_%=:\n\t"
         "sub	x11, x11, %x[tagSz]\n\t"
         "ld1	{v28.2d}, [x11]\n\t"
         "mov	x17, #16\n\t"
@@ -12112,14 +12118,14 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, %x[tagSz]\n\t"
         "add	x11, x11, %x[tagSz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_calc_tag_byte_%=:\n\t"
         "strb	wzr, [x11], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_192_calc_tag_byte_%=\n\t"
         "subs	x11, x11, #16\n\t"
         "ld1	{v26.2d}, [x11]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_192_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_192_tag_loaded_%=:\n\t"
         "eor	v28.16b, v28.16b, v26.16b\n\t"
         "mov	x16, v28.d[0]\n\t"
         "mov	x17, v28.d[1]\n\t"
@@ -12132,12 +12138,12 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b	L_aes_gcm_decrypt_arm64_crypto_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -12432,7 +12438,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -12822,7 +12828,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -12919,7 +12925,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -13071,7 +13077,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -13262,7 +13268,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -13318,7 +13324,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -13421,7 +13427,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -13478,7 +13484,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_256_partial_done_%=\n\t"
         "eor	v15.16b, v15.16b, v15.16b\n\t"
@@ -13490,28 +13496,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_decrypt_arm64_crypto_256_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v15.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -13575,30 +13581,30 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w14, w14, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_out_start_dw_%=:\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	w14, w14, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_out_start_sw_%=:\n\t"
         "cmp	w14, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	w14, w14, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_256_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	w14, w14, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -13660,7 +13666,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "ld1	{v28.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_256_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_part_tag_%=:\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
         "mov	x17, %x[tagSz]\n\t"
         "st1	{v28.2d}, [x11]\n\t"
@@ -13670,28 +13676,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, #8\n\t"
         "str	x16, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_dw_%=:\n\t"
         "cmp	x17, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_tag_start_sw_%=\n\t"
         "ldr	w16, [%x[tag]], #4\n\t"
         "sub	x17, x17, #4\n\t"
         "str	w16, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_sw_%=:\n\t"
         "cmp	x17, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=\n\t"
         "ldrh	w16, [%x[tag]], #2\n\t"
         "sub	x17, x17, #2\n\t"
         "strh	w16, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=:\n\t"
         "cbz	x17, L_aes_gcm_decrypt_arm64_crypto_256_tag_end_bytes_%=\n\t"
         "ldrb	w16, [%x[tag]], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	w16, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_tag_end_bytes_%=:\n\t"
         "sub	x11, x11, %x[tagSz]\n\t"
         "ld1	{v28.2d}, [x11]\n\t"
         "mov	x17, #16\n\t"
@@ -13699,14 +13705,14 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, %x[tagSz]\n\t"
         "add	x11, x11, %x[tagSz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_calc_tag_byte_%=:\n\t"
         "strb	wzr, [x11], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_256_calc_tag_byte_%=\n\t"
         "subs	x11, x11, #16\n\t"
         "ld1	{v26.2d}, [x11]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_256_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_256_tag_loaded_%=:\n\t"
         "eor	v28.16b, v28.16b, v26.16b\n\t"
         "mov	x16, v28.d[0]\n\t"
         "mov	x17, v28.d[1]\n\t"
@@ -13719,12 +13725,12 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b	L_aes_gcm_decrypt_arm64_crypto_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -13951,7 +13957,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -14273,7 +14279,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -14370,7 +14376,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [x9], #32\n\t"
@@ -14489,7 +14495,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -14647,7 +14653,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -14703,7 +14709,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -14789,7 +14795,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -14836,7 +14842,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_128_partial_done_%=\n\t"
         "eor	v15.16b, v15.16b, v15.16b\n\t"
@@ -14848,28 +14854,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_decrypt_arm64_crypto_128_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v15.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -14923,30 +14929,30 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w14, w14, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_out_start_dw_%=:\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	w14, w14, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_out_start_sw_%=:\n\t"
         "cmp	w14, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	w14, w14, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_128_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	w14, w14, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -14996,7 +15002,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "ld1	{v28.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_128_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_part_tag_%=:\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
         "mov	x17, %x[tagSz]\n\t"
         "st1	{v28.2d}, [x11]\n\t"
@@ -15006,28 +15012,28 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, #8\n\t"
         "str	x16, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_dw_%=:\n\t"
         "cmp	x17, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_tag_start_sw_%=\n\t"
         "ldr	w16, [%x[tag]], #4\n\t"
         "sub	x17, x17, #4\n\t"
         "str	w16, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_sw_%=:\n\t"
         "cmp	x17, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=\n\t"
         "ldrh	w16, [%x[tag]], #2\n\t"
         "sub	x17, x17, #2\n\t"
         "strh	w16, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=:\n\t"
         "cbz	x17, L_aes_gcm_decrypt_arm64_crypto_128_tag_end_bytes_%=\n\t"
         "ldrb	w16, [%x[tag]], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	w16, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_tag_end_bytes_%=:\n\t"
         "sub	x11, x11, %x[tagSz]\n\t"
         "ld1	{v28.2d}, [x11]\n\t"
         "mov	x17, #16\n\t"
@@ -15035,14 +15041,14 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, %x[tagSz]\n\t"
         "add	x11, x11, %x[tagSz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_calc_tag_byte_%=:\n\t"
         "strb	wzr, [x11], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_128_calc_tag_byte_%=\n\t"
         "subs	x11, x11, #16\n\t"
         "ld1	{v26.2d}, [x11]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_128_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_128_tag_loaded_%=:\n\t"
         "eor	v28.16b, v28.16b, v26.16b\n\t"
         "mov	x16, v28.d[0]\n\t"
         "mov	x17, v28.d[1]\n\t"
@@ -15053,7 +15059,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "and	%x[in], %x[in], x19\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_done_%=:\n\t"
         "ldp	x29, x30, [sp], #0x50\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz),
           [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), [key] "+r" (key),
@@ -15187,7 +15193,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_h_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_h_done_%=:\n\t"
         "lsr	w14, w8, #4\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_1_%=\n\t"
@@ -15196,7 +15202,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_8_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -15295,7 +15301,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #16\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_4_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -15351,7 +15357,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_done_%=\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_2_%=:\n\t"
         "ld1	{v18.16b, v19.16b}, [%x[aad]], #32\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -15385,10 +15391,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.gt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_2_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_1_%=:\n\t"
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_eor3_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_both_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_both_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[aad]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v26.16b, v18.16b\n\t"
@@ -15410,7 +15416,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_both_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_done_%=:\n\t"
         "and	w14, w8, #15\n\t"
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_eor3_aad_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -15422,28 +15428,28 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_sw_%=\n\t"
         "ldr	w19, [%x[aad]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t"
         "ldrh	w19, [%x[aad]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_encrypt_arm64_crypto_eor3_aad_end_bytes_%=\n\t"
         "ldrb	w19, [%x[aad]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -15464,7 +15470,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_partial_done_%=:\n\t"
         /* Load Nonce */
         "cmp	%w[nonceSz], #12\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_ghash_nonce_%=\n\t"
@@ -15476,12 +15482,12 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "mov	w15, #1\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_eor3_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_ghash_nonce_%=:\n\t"
         "eor	v13.16b, v13.16b, v13.16b\n\t"
         "lsr	w14, %w[nonceSz], #4\n\t"
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[nonce]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v13.16b, v18.16b\n\t"
@@ -15503,7 +15509,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_done_%=:\n\t"
         "and	w24, %w[nonceSz], #15\n\t"
         "cbz	x24, L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -15515,28 +15521,28 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_sw_%=\n\t"
         "ldr	w19, [%x[nonce]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t"
         "ldrh	w19, [%x[nonce]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_end_bytes_%=\n\t"
         "ldrb	w19, [%x[nonce]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_end_bytes_%=:\n\t"
         "sub	x11, x11, x24\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -15557,7 +15563,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v13.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_partial_done_%=:\n\t"
         "eor	x14, x14, x14\n\t"
         "lsl	x24, %x[nonceSz], #3\n\t"
         "mov	v28.d[0], x14\n\t"
@@ -15581,7 +15587,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "mov	w15, v13.s[3]\n\t"
         "rev	w15, w15\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_done_nonce_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_done_nonce_%=:\n\t"
         "st1	{v13.2d}, [x12]\n\t"
         "lsr	w14, %w[sz], #4\n\t"
         "cmp	w13, #12\n\t"
@@ -15592,7 +15598,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -15853,7 +15859,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -16201,7 +16207,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -16290,7 +16296,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -16425,7 +16431,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -16595,7 +16601,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -16647,7 +16653,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -16739,7 +16745,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -16789,7 +16795,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_192_partial_done_%=\n\t"
         "eor	v16.16b, v16.16b, v16.16b\n\t"
@@ -16801,28 +16807,28 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v16.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -16862,32 +16868,32 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_end_bytes_%=:\n\t"
         "mov	x17, #16\n\t"
         "sub	x17, x17, x14\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_zero_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_zero_%=:\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	wzr, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_zero_%=\n\t"
@@ -16911,7 +16917,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -16964,7 +16970,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v26.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_partial_%=:\n\t"
         "st1	{v26.16b}, [x11]\n\t"
         "cmp	%w[tagSz], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_dw_%=\n\t"
@@ -16972,38 +16978,38 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	%w[tagSz], %w[tagSz], #8\n\t"
         "str	x16, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_dw_%=:\n\t"
         "cmp	%w[tagSz], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_sw_%=\n\t"
         "ldr	w16, [x11], #4\n\t"
         "sub	%w[tagSz], %w[tagSz], #4\n\t"
         "str	w16, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_sw_%=:\n\t"
         "cmp	%w[tagSz], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t"
         "ldrh	w16, [x11], #2\n\t"
         "sub	%w[tagSz], %w[tagSz], #2\n\t"
         "strh	w16, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=:\n\t"
         "cbz	%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_end_bytes_%=\n\t"
         "ldrb	w16, [x11], #1\n\t"
         "subs	%w[tagSz], %w[tagSz], #1\n\t"
         "strb	w16, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_end_bytes_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_start_256_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -17298,7 +17304,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -17680,7 +17686,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -17769,7 +17775,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -17921,7 +17927,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -18108,7 +18114,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -18160,7 +18166,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -18261,7 +18267,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -18317,7 +18323,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_256_partial_done_%=\n\t"
         "eor	v16.16b, v16.16b, v16.16b\n\t"
@@ -18329,28 +18335,28 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v16.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -18396,32 +18402,32 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_end_bytes_%=:\n\t"
         "mov	x17, #16\n\t"
         "sub	x17, x17, x14\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_zero_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_zero_%=:\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	wzr, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_zero_%=\n\t"
@@ -18445,7 +18451,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -18506,7 +18512,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v26.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_partial_%=:\n\t"
         "st1	{v26.16b}, [x11]\n\t"
         "cmp	%w[tagSz], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_dw_%=\n\t"
@@ -18514,38 +18520,38 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	%w[tagSz], %w[tagSz], #8\n\t"
         "str	x16, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_dw_%=:\n\t"
         "cmp	%w[tagSz], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_sw_%=\n\t"
         "ldr	w16, [x11], #4\n\t"
         "sub	%w[tagSz], %w[tagSz], #4\n\t"
         "str	w16, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_sw_%=:\n\t"
         "cmp	%w[tagSz], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t"
         "ldrh	w16, [x11], #2\n\t"
         "sub	%w[tagSz], %w[tagSz], #2\n\t"
         "strh	w16, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=:\n\t"
         "cbz	%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_end_bytes_%=\n\t"
         "ldrb	w16, [x11], #1\n\t"
         "subs	%w[tagSz], %w[tagSz], #1\n\t"
         "strb	w16, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_end_bytes_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_start_128_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -18772,7 +18778,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -19086,7 +19092,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -19175,7 +19181,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [x9], #32\n\t"
@@ -19294,7 +19300,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -19448,7 +19454,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -19500,7 +19506,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -19584,7 +19590,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_encrypt_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -19630,7 +19636,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_encrypt_arm64_crypto_eor3_128_partial_done_%=\n\t"
         "eor	v16.16b, v16.16b, v16.16b\n\t"
@@ -19642,28 +19648,28 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v16.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -19699,32 +19705,32 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_end_bytes_%=:\n\t"
         "mov	x17, #16\n\t"
         "sub	x17, x17, x14\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_zero_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_zero_%=:\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	wzr, [x11], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_zero_%=\n\t"
@@ -19748,7 +19754,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_partial_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -19797,7 +19803,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v26.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_partial_%=:\n\t"
         "st1	{v26.16b}, [x11]\n\t"
         "cmp	%w[tagSz], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_dw_%=\n\t"
@@ -19805,31 +19811,31 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	%w[tagSz], %w[tagSz], #8\n\t"
         "str	x16, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_dw_%=:\n\t"
         "cmp	%w[tagSz], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_sw_%=\n\t"
         "ldr	w16, [x11], #4\n\t"
         "sub	%w[tagSz], %w[tagSz], #4\n\t"
         "str	w16, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_sw_%=:\n\t"
         "cmp	%w[tagSz], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t"
         "ldrh	w16, [x11], #2\n\t"
         "sub	%w[tagSz], %w[tagSz], #2\n\t"
         "strh	w16, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=:\n\t"
         "cbz	%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_end_bytes_%=\n\t"
         "ldrb	w16, [x11], #1\n\t"
         "subs	%w[tagSz], %w[tagSz], #1\n\t"
         "strb	w16, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_end_bytes_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=:\n\t"
         "ldp	x29, x30, [sp], #0x50\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz),
           [tag] "+r" (tag), [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz),
@@ -19962,7 +19968,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_h_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_h_done_%=:\n\t"
         "lsr	w14, w8, #4\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_1_%=\n\t"
@@ -19971,7 +19977,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_8_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -20070,7 +20076,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #16\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_4_%=:\n\t"
         "ld1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[aad]], #0x40\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -20126,7 +20132,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_done_%=\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_2_%=:\n\t"
         "ld1	{v18.16b, v19.16b}, [%x[aad]], #32\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
@@ -20160,10 +20166,10 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.gt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_2_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_1_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_aad_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_both_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_both_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[aad]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v26.16b, v18.16b\n\t"
@@ -20185,7 +20191,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_both_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_done_%=:\n\t"
         "and	w14, w8, #15\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_aad_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -20197,28 +20203,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_sw_%=\n\t"
         "ldr	w19, [%x[aad]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t"
         "ldrh	w19, [%x[aad]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_decrypt_arm64_crypto_eor3_aad_end_bytes_%=\n\t"
         "ldrb	w19, [%x[aad]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -20239,7 +20245,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_partial_done_%=:\n\t"
         /* Load Nonce */
         "cmp	%w[nonceSz], #12\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_ghash_nonce_%=\n\t"
@@ -20251,12 +20257,12 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "mov	w15, #1\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_eor3_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_ghash_nonce_%=:\n\t"
         "eor	v13.16b, v13.16b, v13.16b\n\t"
         "lsr	w14, %w[nonceSz], #4\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_1_%=:\n\t"
         "ld1	{v18.16b}, [%x[nonce]], #16\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "eor	v21.16b, v13.16b, v18.16b\n\t"
@@ -20278,7 +20284,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "subs	w14, w14, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_done_%=:\n\t"
         "and	w24, %w[nonceSz], #15\n\t"
         "cbz	x24, L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_partial_done_%=\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
@@ -20290,28 +20296,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w20, w20, #8\n\t"
         "str	x19, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_dw_%=:\n\t"
         "cmp	w20, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_sw_%=\n\t"
         "ldr	w19, [%x[nonce]], #4\n\t"
         "sub	w20, w20, #4\n\t"
         "str	w19, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_sw_%=:\n\t"
         "cmp	w20, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t"
         "ldrh	w19, [%x[nonce]], #2\n\t"
         "sub	w20, w20, #2\n\t"
         "strh	w19, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=:\n\t"
         "cbz	w20, L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_end_bytes_%=\n\t"
         "ldrb	w19, [%x[nonce]], #1\n\t"
         "subs	w20, w20, #1\n\t"
         "strb	w19, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_end_bytes_%=:\n\t"
         "sub	x11, x11, x24\n\t"
         "ld1	{v18.2d}, [x11]\n\t"
         "rbit	v18.16b, v18.16b\n\t"
@@ -20332,7 +20338,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v13.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_partial_done_%=:\n\t"
         "eor	x14, x14, x14\n\t"
         "lsl	x24, %x[nonceSz], #3\n\t"
         "mov	v28.d[0], x14\n\t"
@@ -20356,7 +20362,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "mov	w15, v13.s[3]\n\t"
         "rev	w15, w15\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_done_nonce_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_done_nonce_%=:\n\t"
         "st1	{v13.2d}, [x12]\n\t"
         "lsr	w14, %w[sz], #4\n\t"
         "cmp	w13, #12\n\t"
@@ -20367,7 +20373,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -20628,7 +20634,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -20976,7 +20982,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -21065,7 +21071,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -21200,7 +21206,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -21370,7 +21376,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -21422,7 +21428,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -21514,7 +21520,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_1_%=:\n\t"
         "ld1	{v15.16b}, [%x[in]], #16\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -21565,7 +21571,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v14.16b, v14.16b, v15.16b\n\t"
         "st1	{v14.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done_%=\n\t"
         "eor	v15.16b, v15.16b, v15.16b\n\t"
@@ -21577,28 +21583,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v15.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -21655,30 +21661,30 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w14, w14, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_dw_%=:\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	w14, w14, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_sw_%=:\n\t"
         "cmp	w14, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	w14, w14, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	w14, w14, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -21731,7 +21737,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "ld1	{v28.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_part_tag_%=:\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
         "mov	x17, %x[tagSz]\n\t"
         "st1	{v28.2d}, [x11]\n\t"
@@ -21741,28 +21747,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, #8\n\t"
         "str	x16, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_dw_%=:\n\t"
         "cmp	x17, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_sw_%=\n\t"
         "ldr	w16, [%x[tag]], #4\n\t"
         "sub	x17, x17, #4\n\t"
         "str	w16, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_sw_%=:\n\t"
         "cmp	x17, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t"
         "ldrh	w16, [%x[tag]], #2\n\t"
         "sub	x17, x17, #2\n\t"
         "strh	w16, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=:\n\t"
         "cbz	x17, L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_end_bytes_%=\n\t"
         "ldrb	w16, [%x[tag]], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	w16, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_end_bytes_%=:\n\t"
         "sub	x11, x11, %x[tagSz]\n\t"
         "ld1	{v28.2d}, [x11]\n\t"
         "mov	x17, #16\n\t"
@@ -21770,14 +21776,14 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, %x[tagSz]\n\t"
         "add	x11, x11, %x[tagSz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_calc_tag_byte_%=:\n\t"
         "strb	wzr, [x11], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_192_calc_tag_byte_%=\n\t"
         "subs	x11, x11, #16\n\t"
         "ld1	{v26.2d}, [x11]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_loaded_%=:\n\t"
         "eor	v28.16b, v28.16b, v26.16b\n\t"
         "mov	x16, v28.d[0]\n\t"
         "mov	x17, v28.d[1]\n\t"
@@ -21790,12 +21796,12 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b	L_aes_gcm_decrypt_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_start_256_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -22090,7 +22096,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -22472,7 +22478,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -22561,7 +22567,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t"
@@ -22713,7 +22719,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -22900,7 +22906,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -22952,7 +22958,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -23053,7 +23059,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -23109,7 +23115,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done_%=\n\t"
         "eor	v15.16b, v15.16b, v15.16b\n\t"
@@ -23121,28 +23127,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v15.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -23205,30 +23211,30 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w14, w14, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_dw_%=:\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	w14, w14, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_sw_%=:\n\t"
         "cmp	w14, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	w14, w14, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	w14, w14, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -23289,7 +23295,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "ld1	{v28.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_part_tag_%=:\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
         "mov	x17, %x[tagSz]\n\t"
         "st1	{v28.2d}, [x11]\n\t"
@@ -23299,28 +23305,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, #8\n\t"
         "str	x16, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_dw_%=:\n\t"
         "cmp	x17, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_sw_%=\n\t"
         "ldr	w16, [%x[tag]], #4\n\t"
         "sub	x17, x17, #4\n\t"
         "str	w16, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_sw_%=:\n\t"
         "cmp	x17, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t"
         "ldrh	w16, [%x[tag]], #2\n\t"
         "sub	x17, x17, #2\n\t"
         "strh	w16, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=:\n\t"
         "cbz	x17, L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_end_bytes_%=\n\t"
         "ldrb	w16, [%x[tag]], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	w16, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_end_bytes_%=:\n\t"
         "sub	x11, x11, %x[tagSz]\n\t"
         "ld1	{v28.2d}, [x11]\n\t"
         "mov	x17, #16\n\t"
@@ -23328,14 +23334,14 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, %x[tagSz]\n\t"
         "add	x11, x11, %x[tagSz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_calc_tag_byte_%=:\n\t"
         "strb	wzr, [x11], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_256_calc_tag_byte_%=\n\t"
         "subs	x11, x11, #16\n\t"
         "ld1	{v26.2d}, [x11]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_loaded_%=:\n\t"
         "eor	v28.16b, v28.16b, v26.16b\n\t"
         "mov	x16, v28.d[0]\n\t"
         "mov	x17, v28.d[1]\n\t"
@@ -23348,12 +23354,12 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b	L_aes_gcm_decrypt_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_start_128_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w14, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -23580,7 +23586,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_8_%=:\n\t"
         "ldr	q12, [x9]\n\t"
         "add	w24, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -23894,7 +23900,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "cmp	w14, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -23983,7 +23989,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [x9], #32\n\t"
@@ -24102,7 +24108,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_4_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w19, w15, #2\n\t"
@@ -24256,7 +24262,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -24308,7 +24314,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_2_%=:\n\t"
         "add	w20, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w15, w15, #2\n\t"
@@ -24392,7 +24398,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         /* Done GHASH */
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_1_%=:\n\t"
         "add	w15, w15, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w16, w15\n\t"
@@ -24438,7 +24444,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_done_%=:\n\t"
         "ands	w14, %w[sz], #15\n\t"
         "b.eq	L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done_%=\n\t"
         "eor	v15.16b, v15.16b, v15.16b\n\t"
@@ -24450,28 +24456,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x19, x19, #8\n\t"
         "str	x17, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_dw_%=:\n\t"
         "cmp	x19, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_sw_%=\n\t"
         "ldr	w17, [%x[in]], #4\n\t"
         "sub	x19, x19, #4\n\t"
         "str	w17, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_sw_%=:\n\t"
         "cmp	x19, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=\n\t"
         "ldrh	w17, [%x[in]], #2\n\t"
         "sub	x19, x19, #2\n\t"
         "strh	w17, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=:\n\t"
         "cbz	x19, L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_bytes_%=\n\t"
         "ldrb	w17, [%x[in]], #1\n\t"
         "subs	x19, x19, #1\n\t"
         "strb	w17, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_bytes_%=:\n\t"
         "sub	x11, x11, x14\n\t"
         "ld1	{v15.2d}, [x11]\n\t"
         "add	w15, w15, #1\n\t"
@@ -24524,30 +24530,30 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	w14, w14, #8\n\t"
         "str	x17, [%x[out]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_dw_%=:\n\t"
         "cmp	w14, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_sw_%=\n\t"
         "ldr	w17, [x11], #4\n\t"
         "sub	w14, w14, #4\n\t"
         "str	w17, [%x[out]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_sw_%=:\n\t"
         "cmp	w14, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t"
         "ldrh	w17, [x11], #2\n\t"
         "sub	w14, w14, #2\n\t"
         "strh	w17, [%x[out]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=:\n\t"
         "cbz	w14, L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_end_bytes_%=\n\t"
         "ldrb	w17, [x11], #1\n\t"
         "subs	w14, w14, #1\n\t"
         "strb	w17, [%x[out]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done_%=:\n\t"
         "ld1	{v14.2d}, [x12]\n\t"
         "lsl	x8, x8, #3\n\t"
         "rbit	x8, x8\n\t"
@@ -24596,7 +24602,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "ld1	{v28.16b}, [%x[tag]]\n\t"
         "b	L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_part_tag_%=:\n\t"
         "eor	v28.16b, v28.16b, v28.16b\n\t"
         "mov	x17, %x[tagSz]\n\t"
         "st1	{v28.2d}, [x11]\n\t"
@@ -24606,28 +24612,28 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, #8\n\t"
         "str	x16, [x11], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_dw_%=:\n\t"
         "cmp	x17, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_sw_%=\n\t"
         "ldr	w16, [%x[tag]], #4\n\t"
         "sub	x17, x17, #4\n\t"
         "str	w16, [x11], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_sw_%=:\n\t"
         "cmp	x17, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t"
         "ldrh	w16, [%x[tag]], #2\n\t"
         "sub	x17, x17, #2\n\t"
         "strh	w16, [x11], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=:\n\t"
         "cbz	x17, L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_end_bytes_%=\n\t"
         "ldrb	w16, [%x[tag]], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "strb	w16, [x11], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_end_bytes_%=:\n\t"
         "sub	x11, x11, %x[tagSz]\n\t"
         "ld1	{v28.2d}, [x11]\n\t"
         "mov	x17, #16\n\t"
@@ -24635,14 +24641,14 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "sub	x17, x17, %x[tagSz]\n\t"
         "add	x11, x11, %x[tagSz]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_calc_tag_byte_%=:\n\t"
         "strb	wzr, [x11], #1\n\t"
         "subs	x17, x17, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_arm64_crypto_eor3_128_calc_tag_byte_%=\n\t"
         "subs	x11, x11, #16\n\t"
         "ld1	{v26.2d}, [x11]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_loaded_%=:\n\t"
         "eor	v28.16b, v28.16b, v26.16b\n\t"
         "mov	x16, v28.d[0]\n\t"
         "mov	x17, v28.d[1]\n\t"
@@ -24653,7 +24659,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz,
         "and	%x[in], %x[in], x19\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_decrypt_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_decrypt_arm64_crypto_eor3_done_%=:\n\t"
         "ldp	x29, x30, [sp], #0x50\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz),
           [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), [key] "+r" (key),
@@ -24690,12 +24696,12 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz,
         "mov	w8, #1\n\t"
         "b	L_aes_gcm_init_arm64_crypto_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_ghash_nonce_%=:\n\t"
         "eor	v4.16b, v4.16b, v4.16b\n\t"
         "lsr	w7, %w[nonceSz], #4\n\t"
         "cbz	w7, L_aes_gcm_init_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_start_1_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_start_1_%=:\n\t"
         "ld1	{v0.16b}, [%x[nonce]], #16\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "eor	v3.16b, v4.16b, v0.16b\n\t"
@@ -24718,7 +24724,7 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz,
         "subs	w7, w7, #1\n\t"
         "b.ne	L_aes_gcm_init_arm64_crypto_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_done_%=:\n\t"
         "and	w13, %w[nonceSz], #15\n\t"
         "cbz	x13, L_aes_gcm_init_arm64_crypto_partial_done_%=\n\t"
         "eor	v7.16b, v7.16b, v7.16b\n\t"
@@ -24730,28 +24736,28 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz,
         "sub	w12, w12, #8\n\t"
         "str	x11, [%x[initCtr]], #8\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_start_dw_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_start_dw_%=:\n\t"
         "cmp	w12, #4\n\t"
         "b.lt	L_aes_gcm_init_arm64_crypto_start_sw_%=\n\t"
         "ldr	w11, [%x[nonce]], #4\n\t"
         "sub	w12, w12, #4\n\t"
         "str	w11, [%x[initCtr]], #4\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_start_sw_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_start_sw_%=:\n\t"
         "cmp	w12, #2\n\t"
         "b.lt	L_aes_gcm_init_arm64_crypto_start_byte_%=\n\t"
         "ldrh	w11, [%x[nonce]], #2\n\t"
         "sub	w12, w12, #2\n\t"
         "strh	w11, [%x[initCtr]], #2\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_start_byte_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_start_byte_%=:\n\t"
         "cbz	w12, L_aes_gcm_init_arm64_crypto_end_bytes_%=\n\t"
         "ldrb	w11, [%x[nonce]], #1\n\t"
         "subs	w12, w12, #1\n\t"
         "strb	w11, [%x[initCtr]], #1\n\t"
         "b.ne	L_aes_gcm_init_arm64_crypto_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_end_bytes_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_end_bytes_%=:\n\t"
         "sub	%x[initCtr], %x[initCtr], x13\n\t"
         "ld1	{v0.2d}, [%x[initCtr]]\n\t"
         "rbit	v0.16b, v0.16b\n\t"
@@ -24773,7 +24779,7 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz,
         "eor	v4.16b, v7.16b, v9.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_init_arm64_crypto_partial_done_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_partial_done_%=:\n\t"
         "eor	x7, x7, x7\n\t"
         "lsl	x13, %x[nonceSz], #3\n\t"
         "mov	v7.d[0], x7\n\t"
@@ -24798,7 +24804,7 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz,
         "mov	w8, v4.s[3]\n\t"
         "rev	w8, w8\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_done_nonce_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_done_nonce_%=:\n\t"
         "st1	{v4.2d}, [%x[counter]]\n\t"
         "ld1	{v7.2d, v8.2d, v9.2d, v10.2d}, [%x[key]], #0x40\n\t"
         "aese	v4.16b, v7.16b\n\t"
@@ -24837,7 +24843,7 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz,
         "aesmc	v4.16b, v4.16b\n\t"
         "aese	v4.16b, v8.16b\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_round_done_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_round_done_%=:\n\t"
         "ld1	{v7.2d}, [%x[key]]\n\t"
         "eor	v4.16b, v4.16b, v7.16b\n\t"
         "st1	{v4.2d}, [%x[initCtr]]\n\t"
@@ -24981,7 +24987,7 @@ void AES_GCM_aad_update_AARCH64(const byte* aadt, word32 abytes, byte* tag,
         "eor	v19.16b, v10.16b, v11.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_h_done_%=:\n\t"
         "lsr	%w[abytes], %w[abytes], #4\n\t"
         "cmp	%w[abytes], #4\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_start_1_%=\n\t"
@@ -24990,7 +24996,7 @@ void AES_GCM_aad_update_AARCH64(const byte* aadt, word32 abytes, byte* tag,
         "cmp	%w[abytes], #0x40\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_start_8_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aadt]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[aadt]], #0x40\n\t"
         "rbit	v0.16b, v0.16b\n\t"
@@ -25097,7 +25103,7 @@ void AES_GCM_aad_update_AARCH64(const byte* aadt, word32 abytes, byte* tag,
         "cmp	%w[abytes], #16\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_start_4_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aadt]], #0x40\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "rbit	v1.16b, v1.16b\n\t"
@@ -25157,7 +25163,7 @@ void AES_GCM_aad_update_AARCH64(const byte* aadt, word32 abytes, byte* tag,
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_done_%=\n\t"
         "b.eq	L_aes_gcm_aad_update_arm64_crypto_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_start_2_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_start_2_%=:\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[aadt]], #32\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "rbit	v1.16b, v1.16b\n\t"
@@ -25193,10 +25199,10 @@ void AES_GCM_aad_update_AARCH64(const byte* aadt, word32 abytes, byte* tag,
         "b.gt	L_aes_gcm_aad_update_arm64_crypto_start_2_%=\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_start_1_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_start_1_%=:\n\t"
         "cbz	%w[abytes], L_aes_gcm_aad_update_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_both_1_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_both_1_%=:\n\t"
         "ld1	{v0.16b}, [%x[aadt]], #16\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "eor	v3.16b, v20.16b, v0.16b\n\t"
@@ -25219,7 +25225,7 @@ void AES_GCM_aad_update_AARCH64(const byte* aadt, word32 abytes, byte* tag,
         "subs	%w[abytes], %w[abytes], #1\n\t"
         "b.ne	L_aes_gcm_aad_update_arm64_crypto_both_1_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_done_%=:\n\t"
         "st1	{v20.2d}, [%x[tag]]\n\t"
         : [abytes] "+r" (abytes), [tag] "+r" (tag), [gcm_h] "+r" (gcm_h)
         : [aadt] "r" (aadt)
@@ -25278,7 +25284,7 @@ void AES_GCM_encrypt_block_AARCH64(const byte* key, int nr, byte* out,
         "aesmc	v5.16b, v5.16b\n\t"
         "aese	v5.16b, v1.16b\n\t"
         "\n"
-    "L_aes_gcm_encrypt_block_arm64_crypto_round_done_%=: \n\t"
+    "L_aes_gcm_encrypt_block_arm64_crypto_round_done_%=:\n\t"
         "ld1	{v0.2d}, [%x[key]]\n\t"
         "eor	v5.16b, v5.16b, v0.16b\n\t"
         "eor	v4.16b, v4.16b, v5.16b\n\t"
@@ -25392,7 +25398,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_h_done_%=:\n\t"
         "lsr	w8, %w[nbytes], #4\n\t"
         "cmp	%w[nr], #12\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_start_128_%=\n\t"
@@ -25402,7 +25408,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -25663,7 +25669,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -26019,7 +26025,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -26116,7 +26122,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -26251,7 +26257,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -26425,7 +26431,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -26481,7 +26487,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_encrypt_update_arm64_crypto_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -26575,7 +26581,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_encrypt_update_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -26626,17 +26632,17 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_gcm_encrypt_update_arm64_crypto_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -26931,7 +26937,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -27321,7 +27327,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -27418,7 +27424,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -27570,7 +27576,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -27761,7 +27767,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -27817,7 +27823,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_encrypt_update_arm64_crypto_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -27920,7 +27926,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_encrypt_update_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -27977,17 +27983,17 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_gcm_encrypt_update_arm64_crypto_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -28214,7 +28220,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -28536,7 +28542,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -28633,7 +28639,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [%x[key]], #32\n\t"
@@ -28752,7 +28758,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -28910,7 +28916,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -28966,7 +28972,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_encrypt_update_arm64_crypto_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -29052,7 +29058,7 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_encrypt_update_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -29099,10 +29105,10 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_done_%=:\n\t"
         "rev	w9, w9\n\t"
         "mov	v13.s[3], w9\n\t"
         "st1	{v26.2d}, [%x[tag]]\n\t"
@@ -29156,7 +29162,7 @@ void AES_GCM_encrypt_final_AARCH64(byte* tag, byte* authTag, word32 tbytes,
         "st1	{v5.16b}, [%x[authTag]]\n\t"
         "b	L_aes_gcm_encrypt_final_arm64_crypto_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_tag_partial_%=:\n\t"
         "st1	{v5.16b}, [%x[tag]]\n\t"
         "cmp	%w[tbytes], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_final_arm64_crypto_tag_start_dw_%=\n\t"
@@ -29164,30 +29170,30 @@ void AES_GCM_encrypt_final_AARCH64(byte* tag, byte* authTag, word32 tbytes,
         "sub	%w[tbytes], %w[tbytes], #8\n\t"
         "str	x8, [%x[authTag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_tag_start_dw_%=:\n\t"
         "cmp	%w[tbytes], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_final_arm64_crypto_tag_start_sw_%=\n\t"
         "ldr	w8, [%x[tag]], #4\n\t"
         "sub	%w[tbytes], %w[tbytes], #4\n\t"
         "str	w8, [%x[authTag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_tag_start_sw_%=:\n\t"
         "cmp	%w[tbytes], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_final_arm64_crypto_tag_start_byte_%=\n\t"
         "ldrh	w8, [%x[tag]], #2\n\t"
         "sub	%w[tbytes], %w[tbytes], #2\n\t"
         "strh	w8, [%x[authTag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_tag_start_byte_%=:\n\t"
         "cbz	%w[tbytes], L_aes_gcm_encrypt_final_arm64_crypto_tag_end_bytes_%=\n\t"
         "ldrb	w8, [%x[tag]], #1\n\t"
         "subs	%w[tbytes], %w[tbytes], #1\n\t"
         "strb	w8, [%x[authTag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_final_arm64_crypto_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_tag_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_done_%=:\n\t"
         : [tag] "+r" (tag), [authTag] "+r" (authTag), [tbytes] "+r" (tbytes),
           [nbytes] "+r" (nbytes), [abytes] "+r" (abytes), [h] "+r" (h),
           [initCtr] "+r" (initCtr)
@@ -29300,7 +29306,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_h_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_h_done_%=:\n\t"
         "lsr	w8, %w[nbytes], #4\n\t"
         "cmp	%w[nr], #12\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_start_128_%=\n\t"
@@ -29310,7 +29316,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -29571,7 +29577,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -29927,7 +29933,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -30024,7 +30030,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -30159,7 +30165,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -30333,7 +30339,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -30389,7 +30395,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_decrypt_update_arm64_crypto_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -30483,7 +30489,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_decrypt_update_arm64_crypto_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_start_1_%=:\n\t"
         "ld1	{v15.16b}, [%x[in]], #16\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -30535,17 +30541,17 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v14.16b, v14.16b, v15.16b\n\t"
         "st1	{v14.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_gcm_decrypt_update_arm64_crypto_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -30840,7 +30846,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -31230,7 +31236,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -31327,7 +31333,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -31479,7 +31485,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -31670,7 +31676,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -31726,7 +31732,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_decrypt_update_arm64_crypto_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -31829,7 +31835,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_decrypt_update_arm64_crypto_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -31886,17 +31892,17 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_gcm_decrypt_update_arm64_crypto_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -32123,7 +32129,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -32445,7 +32451,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -32542,7 +32548,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [%x[key]], #32\n\t"
@@ -32661,7 +32667,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -32819,7 +32825,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -32875,7 +32881,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_decrypt_update_arm64_crypto_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -32961,7 +32967,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_decrypt_update_arm64_crypto_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -33008,10 +33014,10 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_done_%=:\n\t"
         "rev	w9, w9\n\t"
         "mov	v13.s[3], w9\n\t"
         "st1	{v26.2d}, [%x[tag]]\n\t"
@@ -33068,7 +33074,7 @@ void AES_GCM_decrypt_final_AARCH64(byte* tag, const byte* authTag,
         "ld1	{v0.16b}, [%x[authTag]]\n\t"
         "b	L_aes_gcm_decrypt_final_arm64_crypto_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_part_tag_%=:\n\t"
         "eor	v0.16b, v0.16b, v0.16b\n\t"
         "mov	x10, %x[tbytes]\n\t"
         "st1	{v0.2d}, [%x[tag]]\n\t"
@@ -33078,28 +33084,28 @@ void AES_GCM_decrypt_final_AARCH64(byte* tag, const byte* authTag,
         "sub	x10, x10, #8\n\t"
         "str	x9, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_tag_start_dw_%=:\n\t"
         "cmp	x10, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_final_arm64_crypto_tag_start_sw_%=\n\t"
         "ldr	w9, [%x[authTag]], #4\n\t"
         "sub	x10, x10, #4\n\t"
         "str	w9, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_tag_start_sw_%=:\n\t"
         "cmp	x10, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_final_arm64_crypto_tag_start_byte_%=\n\t"
         "ldrh	w9, [%x[authTag]], #2\n\t"
         "sub	x10, x10, #2\n\t"
         "strh	w9, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_tag_start_byte_%=:\n\t"
         "cbz	x10, L_aes_gcm_decrypt_final_arm64_crypto_tag_end_bytes_%=\n\t"
         "ldrb	w9, [%x[authTag]], #1\n\t"
         "subs	x10, x10, #1\n\t"
         "strb	w9, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_final_arm64_crypto_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_tag_end_bytes_%=:\n\t"
         "sub	%x[tag], %x[tag], %x[tbytes]\n\t"
         "ld1	{v0.2d}, [%x[tag]]\n\t"
         "mov	x10, #16\n\t"
@@ -33107,14 +33113,14 @@ void AES_GCM_decrypt_final_AARCH64(byte* tag, const byte* authTag,
         "sub	x10, x10, %x[tbytes]\n\t"
         "add	%x[tag], %x[tag], %x[tbytes]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_calc_tag_byte_%=:\n\t"
         "strb	wzr, [%x[tag]], #1\n\t"
         "subs	x10, x10, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_final_arm64_crypto_calc_tag_byte_%=\n\t"
         "subs	%x[tag], %x[tag], #16\n\t"
         "ld1	{v5.2d}, [%x[tag]]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_tag_loaded_%=:\n\t"
         "eor	v0.16b, v0.16b, v5.16b\n\t"
         "mov	x9, v0.d[0]\n\t"
         "mov	x10, v0.d[1]\n\t"
@@ -33154,12 +33160,12 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce,
         "mov	w8, #1\n\t"
         "b	L_aes_gcm_init_arm64_crypto_eor3_done_nonce_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_ghash_nonce_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_ghash_nonce_%=:\n\t"
         "eor	v4.16b, v4.16b, v4.16b\n\t"
         "lsr	w7, %w[nonceSz], #4\n\t"
         "cbz	w7, L_aes_gcm_init_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_start_1_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_start_1_%=:\n\t"
         "ld1	{v0.16b}, [%x[nonce]], #16\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "eor	v3.16b, v4.16b, v0.16b\n\t"
@@ -33181,7 +33187,7 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce,
         "subs	w7, w7, #1\n\t"
         "b.ne	L_aes_gcm_init_arm64_crypto_eor3_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_done_%=:\n\t"
         "and	w13, %w[nonceSz], #15\n\t"
         "cbz	x13, L_aes_gcm_init_arm64_crypto_eor3_partial_done_%=\n\t"
         "eor	v7.16b, v7.16b, v7.16b\n\t"
@@ -33193,28 +33199,28 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce,
         "sub	w12, w12, #8\n\t"
         "str	x11, [%x[initCtr]], #8\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_start_dw_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_start_dw_%=:\n\t"
         "cmp	w12, #4\n\t"
         "b.lt	L_aes_gcm_init_arm64_crypto_eor3_start_sw_%=\n\t"
         "ldr	w11, [%x[nonce]], #4\n\t"
         "sub	w12, w12, #4\n\t"
         "str	w11, [%x[initCtr]], #4\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_start_sw_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_start_sw_%=:\n\t"
         "cmp	w12, #2\n\t"
         "b.lt	L_aes_gcm_init_arm64_crypto_eor3_start_byte_%=\n\t"
         "ldrh	w11, [%x[nonce]], #2\n\t"
         "sub	w12, w12, #2\n\t"
         "strh	w11, [%x[initCtr]], #2\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_start_byte_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_start_byte_%=:\n\t"
         "cbz	w12, L_aes_gcm_init_arm64_crypto_eor3_end_bytes_%=\n\t"
         "ldrb	w11, [%x[nonce]], #1\n\t"
         "subs	w12, w12, #1\n\t"
         "strb	w11, [%x[initCtr]], #1\n\t"
         "b.ne	L_aes_gcm_init_arm64_crypto_eor3_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_end_bytes_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_end_bytes_%=:\n\t"
         "sub	%x[initCtr], %x[initCtr], x13\n\t"
         "ld1	{v0.2d}, [%x[initCtr]]\n\t"
         "rbit	v0.16b, v0.16b\n\t"
@@ -33235,7 +33241,7 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce,
         "eor	v4.16b, v7.16b, v9.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_partial_done_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_partial_done_%=:\n\t"
         "eor	x7, x7, x7\n\t"
         "lsl	x13, %x[nonceSz], #3\n\t"
         "mov	v7.d[0], x7\n\t"
@@ -33259,7 +33265,7 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce,
         "mov	w8, v4.s[3]\n\t"
         "rev	w8, w8\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_done_nonce_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_done_nonce_%=:\n\t"
         "st1	{v4.2d}, [%x[counter]]\n\t"
         "ld1	{v7.2d, v8.2d, v9.2d, v10.2d}, [%x[key]], #0x40\n\t"
         "aese	v4.16b, v7.16b\n\t"
@@ -33298,7 +33304,7 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce,
         "aesmc	v4.16b, v4.16b\n\t"
         "aese	v4.16b, v8.16b\n\t"
         "\n"
-    "L_aes_gcm_init_arm64_crypto_eor3_round_done_%=: \n\t"
+    "L_aes_gcm_init_arm64_crypto_eor3_round_done_%=:\n\t"
         "ld1	{v7.2d}, [%x[key]]\n\t"
         "eor	v4.16b, v4.16b, v7.16b\n\t"
         "st1	{v4.2d}, [%x[initCtr]]\n\t"
@@ -33438,7 +33444,7 @@ void AES_GCM_aad_update_AARCH64_EOR3(const byte* aadt, word32 abytes, byte* tag,
         "eor	v19.16b, v10.16b, v11.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_h_done_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_h_done_%=:\n\t"
         "lsr	%w[abytes], %w[abytes], #4\n\t"
         "cmp	%w[abytes], #4\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_eor3_start_1_%=\n\t"
@@ -33447,7 +33453,7 @@ void AES_GCM_aad_update_AARCH64_EOR3(const byte* aadt, word32 abytes, byte* tag,
         "cmp	%w[abytes], #0x40\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_eor3_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_8_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_8_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aadt]], #0x40\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[aadt]], #0x40\n\t"
         "rbit	v0.16b, v0.16b\n\t"
@@ -33546,7 +33552,7 @@ void AES_GCM_aad_update_AARCH64_EOR3(const byte* aadt, word32 abytes, byte* tag,
         "cmp	%w[abytes], #16\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_eor3_start_2_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_4_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[aadt]], #0x40\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "rbit	v1.16b, v1.16b\n\t"
@@ -33602,7 +33608,7 @@ void AES_GCM_aad_update_AARCH64_EOR3(const byte* aadt, word32 abytes, byte* tag,
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_eor3_done_%=\n\t"
         "b.eq	L_aes_gcm_aad_update_arm64_crypto_eor3_start_1_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_2_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_2_%=:\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[aadt]], #32\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "rbit	v1.16b, v1.16b\n\t"
@@ -33636,10 +33642,10 @@ void AES_GCM_aad_update_AARCH64_EOR3(const byte* aadt, word32 abytes, byte* tag,
         "b.gt	L_aes_gcm_aad_update_arm64_crypto_eor3_start_2_%=\n\t"
         "b.lt	L_aes_gcm_aad_update_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_1_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_start_1_%=:\n\t"
         "cbz	%w[abytes], L_aes_gcm_aad_update_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_both_1_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_both_1_%=:\n\t"
         "ld1	{v0.16b}, [%x[aadt]], #16\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "eor	v3.16b, v20.16b, v0.16b\n\t"
@@ -33661,7 +33667,7 @@ void AES_GCM_aad_update_AARCH64_EOR3(const byte* aadt, word32 abytes, byte* tag,
         "subs	%w[abytes], %w[abytes], #1\n\t"
         "b.ne	L_aes_gcm_aad_update_arm64_crypto_eor3_both_1_%=\n\t"
         "\n"
-    "L_aes_gcm_aad_update_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_aad_update_arm64_crypto_eor3_done_%=:\n\t"
         "st1	{v20.2d}, [%x[tag]]\n\t"
         : [abytes] "+r" (abytes), [tag] "+r" (tag), [gcm_h] "+r" (gcm_h)
         : [aadt] "r" (aadt)
@@ -33720,7 +33726,7 @@ void AES_GCM_encrypt_block_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "aesmc	v5.16b, v5.16b\n\t"
         "aese	v5.16b, v1.16b\n\t"
         "\n"
-    "L_aes_gcm_encrypt_block_arm64_crypto_eor3_round_done_%=: \n\t"
+    "L_aes_gcm_encrypt_block_arm64_crypto_eor3_round_done_%=:\n\t"
         "ld1	{v0.2d}, [%x[key]]\n\t"
         "eor	v5.16b, v5.16b, v0.16b\n\t"
         "eor	v4.16b, v4.16b, v5.16b\n\t"
@@ -33831,7 +33837,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_h_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_h_done_%=:\n\t"
         "lsr	w8, %w[nbytes], #4\n\t"
         "cmp	%w[nr], #12\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_start_128_%=\n\t"
@@ -33841,7 +33847,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -34102,7 +34108,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -34450,7 +34456,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -34539,7 +34545,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -34674,7 +34680,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -34844,7 +34850,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -34896,7 +34902,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -34988,7 +34994,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -35038,17 +35044,17 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_gcm_encrypt_update_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_start_256_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -35343,7 +35349,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -35725,7 +35731,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -35814,7 +35820,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -35966,7 +35972,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -36153,7 +36159,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -36205,7 +36211,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -36306,7 +36312,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -36362,17 +36368,17 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_gcm_encrypt_update_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_start_128_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -36599,7 +36605,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_both_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -36913,7 +36919,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_end_8_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -37002,7 +37008,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [%x[key]], #32\n\t"
@@ -37121,7 +37127,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_both_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -37275,7 +37281,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_end_4_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -37327,7 +37333,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_2_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -37411,7 +37417,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_1_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -37457,10 +37463,10 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_encrypt_update_arm64_crypto_eor3_done_%=:\n\t"
         "rev	w9, w9\n\t"
         "mov	v13.s[3], w9\n\t"
         "st1	{v26.2d}, [%x[tag]]\n\t"
@@ -37513,7 +37519,7 @@ void AES_GCM_encrypt_final_AARCH64_EOR3(byte* tag, byte* authTag, word32 tbytes,
         "st1	{v5.16b}, [%x[authTag]]\n\t"
         "b	L_aes_gcm_encrypt_final_arm64_crypto_eor3_done_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_partial_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_partial_%=:\n\t"
         "st1	{v5.16b}, [%x[tag]]\n\t"
         "cmp	%w[tbytes], #8\n\t"
         "b.lt	L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_dw_%=\n\t"
@@ -37521,30 +37527,30 @@ void AES_GCM_encrypt_final_AARCH64_EOR3(byte* tag, byte* authTag, word32 tbytes,
         "sub	%w[tbytes], %w[tbytes], #8\n\t"
         "str	x8, [%x[authTag]], #8\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_dw_%=:\n\t"
         "cmp	%w[tbytes], #4\n\t"
         "b.lt	L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_sw_%=\n\t"
         "ldr	w8, [%x[tag]], #4\n\t"
         "sub	%w[tbytes], %w[tbytes], #4\n\t"
         "str	w8, [%x[authTag]], #4\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_sw_%=:\n\t"
         "cmp	%w[tbytes], #2\n\t"
         "b.lt	L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_byte_%=\n\t"
         "ldrh	w8, [%x[tag]], #2\n\t"
         "sub	%w[tbytes], %w[tbytes], #2\n\t"
         "strh	w8, [%x[authTag]], #2\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_byte_%=:\n\t"
         "cbz	%w[tbytes], L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_end_bytes_%=\n\t"
         "ldrb	w8, [%x[tag]], #1\n\t"
         "subs	%w[tbytes], %w[tbytes], #1\n\t"
         "strb	w8, [%x[authTag]], #1\n\t"
         "b.ne	L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_tag_end_bytes_%=:\n\t"
         "\n"
-    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_encrypt_final_arm64_crypto_eor3_done_%=:\n\t"
         : [tag] "+r" (tag), [authTag] "+r" (authTag), [tbytes] "+r" (tbytes),
           [nbytes] "+r" (nbytes), [abytes] "+r" (abytes), [h] "+r" (h),
           [initCtr] "+r" (initCtr)
@@ -37654,7 +37660,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v7.16b, v30.16b, v31.16b\n\t"
         /* Done */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_h_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_h_done_%=:\n\t"
         "lsr	w8, %w[nbytes], #4\n\t"
         "cmp	%w[nr], #12\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_start_128_%=\n\t"
@@ -37664,7 +37670,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -37925,7 +37931,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -38273,7 +38279,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -38362,7 +38368,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -38497,7 +38503,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -38667,7 +38673,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -38719,7 +38725,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -38811,7 +38817,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_start_1_%=:\n\t"
         "ld1	{v15.16b}, [%x[in]], #16\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -38862,17 +38868,17 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v14.16b, v14.16b, v15.16b\n\t"
         "st1	{v14.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_192_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_gcm_decrypt_update_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_256 */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_start_256_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -39167,7 +39173,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -39549,7 +39555,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -39638,7 +39644,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t"
@@ -39790,7 +39796,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -39977,7 +39983,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -40029,7 +40035,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -40130,7 +40136,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -40186,17 +40192,17 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_256_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_gcm_decrypt_update_arm64_crypto_eor3_done_%=\n\t"
         /* AES_GCM_128 */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_start_128_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "cmp	w8, #32\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -40423,7 +40429,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_end_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_both_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_both_8_%=:\n\t"
         "ldr	q12, [%x[key]]\n\t"
         "add	w17, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
@@ -40737,7 +40743,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "cmp	w8, #8\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_both_8_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_end_8_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_end_8_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -40826,7 +40832,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_4_%=:\n\t"
         "ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t"
         "ld1	{v8.2d, v9.2d}, [%x[key]], #32\n\t"
@@ -40945,7 +40951,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_end_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_both_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_both_4_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w12, w9, #2\n\t"
@@ -41099,7 +41105,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "st1	{v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t"
         "b.ge	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_both_4_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_end_4_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_end_4_%=:\n\t"
         "rbit	v18.16b, v18.16b\n\t"
         "rbit	v19.16b, v19.16b\n\t"
         "rbit	v20.16b, v20.16b\n\t"
@@ -41151,7 +41157,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "b.eq	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_1_%=\n\t"
         "b.lt	L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_2_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_2_%=:\n\t"
         "add	w13, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "add	w9, w9, #2\n\t"
@@ -41235,7 +41241,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         /* Done GHASH */
         "cbz	w8, L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_done_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_1_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_start_1_%=:\n\t"
         "add	w9, w9, #1\n\t"
         "mov	v14.16b, v13.16b\n\t"
         "rev	w10, w9\n\t"
@@ -41281,10 +41287,10 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out,
         "eor	v26.16b, v28.16b, v30.16b\n\t"
         /* Done GHASH */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_128_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_done_%=: \n\t"
+    "L_aes_gcm_decrypt_update_arm64_crypto_eor3_done_%=:\n\t"
         "rev	w9, w9\n\t"
         "mov	v13.s[3], w9\n\t"
         "st1	{v26.2d}, [%x[tag]]\n\t"
@@ -41340,7 +41346,7 @@ void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag,
         "ld1	{v0.16b}, [%x[authTag]]\n\t"
         "b	L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_loaded_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_part_tag_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_part_tag_%=:\n\t"
         "eor	v0.16b, v0.16b, v0.16b\n\t"
         "mov	x10, %x[tbytes]\n\t"
         "st1	{v0.2d}, [%x[tag]]\n\t"
@@ -41350,28 +41356,28 @@ void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag,
         "sub	x10, x10, #8\n\t"
         "str	x9, [%x[tag]], #8\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_dw_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_dw_%=:\n\t"
         "cmp	x10, #4\n\t"
         "b.lt	L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_sw_%=\n\t"
         "ldr	w9, [%x[authTag]], #4\n\t"
         "sub	x10, x10, #4\n\t"
         "str	w9, [%x[tag]], #4\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_sw_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_sw_%=:\n\t"
         "cmp	x10, #2\n\t"
         "b.lt	L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_byte_%=\n\t"
         "ldrh	w9, [%x[authTag]], #2\n\t"
         "sub	x10, x10, #2\n\t"
         "strh	w9, [%x[tag]], #2\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_byte_%=:\n\t"
         "cbz	x10, L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_end_bytes_%=\n\t"
         "ldrb	w9, [%x[authTag]], #1\n\t"
         "subs	x10, x10, #1\n\t"
         "strb	w9, [%x[tag]], #1\n\t"
         "b.ne	L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_start_byte_%=\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_end_bytes_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_end_bytes_%=:\n\t"
         "sub	%x[tag], %x[tag], %x[tbytes]\n\t"
         "ld1	{v0.2d}, [%x[tag]]\n\t"
         "mov	x10, #16\n\t"
@@ -41379,14 +41385,14 @@ void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag,
         "sub	x10, x10, %x[tbytes]\n\t"
         "add	%x[tag], %x[tag], %x[tbytes]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_calc_tag_byte_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_calc_tag_byte_%=:\n\t"
         "strb	wzr, [%x[tag]], #1\n\t"
         "subs	x10, x10, #1\n\t"
         "b.ne	L_aes_gcm_decrypt_final_arm64_crypto_eor3_calc_tag_byte_%=\n\t"
         "subs	%x[tag], %x[tag], #16\n\t"
         "ld1	{v5.2d}, [%x[tag]]\n\t"
         "\n"
-    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_loaded_%=: \n\t"
+    "L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_loaded_%=:\n\t"
         "eor	v0.16b, v0.16b, v5.16b\n\t"
         "mov	x9, v0.d[0]\n\t"
         "mov	x10, v0.d[1]\n\t"
@@ -41472,7 +41478,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_xts_encrypt_arm64_crypto_192_start_2_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_192_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "mov	v5.d[0], x12\n\t"
         "mov	v5.d[1], x13\n\t"
@@ -41603,7 +41609,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.ge	L_aes_xts_encrypt_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_192_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_xts_encrypt_arm64_crypto_192_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -41672,7 +41678,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w8, w8, #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_192_start_1_%=:\n\t"
         "cbz	w8, L_aes_xts_encrypt_arm64_crypto_192_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "eor	v0.16b, v0.16b, v4.16b\n\t"
@@ -41708,14 +41714,14 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	v4.d[1], x11\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_192_done_%=:\n\t"
         "cbz	%w[sz], L_aes_xts_encrypt_arm64_crypto_192_partial_done_%=\n\t"
         "sub	%x[out], %x[out], #16\n\t"
         "ld1	{v0.16b}, [%x[out]], #16\n\t"
         "st1	{v0.2d}, [%x[tmp]]\n\t"
         "mov	w9, %w[sz]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_192_start_byte_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_192_start_byte_%=:\n\t"
         "ldrb	w12, [%x[tmp]]\n\t"
         "ldrb	w13, [%x[in]], #1\n\t"
         "strb	w12, [%x[out]], #1\n\t"
@@ -41754,12 +41760,12 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_192_partial_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_192_partial_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_xts_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_XTS_256 */
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v24.2d, v25.2d, v26.2d, v27.2d}, [%x[key2]], #0x40\n\t"
         "ld1	{v28.2d, v29.2d}, [%x[key2]], #32\n\t"
@@ -41811,7 +41817,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_xts_encrypt_arm64_crypto_256_start_2_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_256_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "mov	v5.d[0], x12\n\t"
         "mov	v5.d[1], x13\n\t"
@@ -41958,7 +41964,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.ge	L_aes_xts_encrypt_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_256_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_xts_encrypt_arm64_crypto_256_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -42035,7 +42041,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w8, w8, #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_256_start_1_%=:\n\t"
         "cbz	w8, L_aes_xts_encrypt_arm64_crypto_256_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "eor	v0.16b, v0.16b, v4.16b\n\t"
@@ -42075,14 +42081,14 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	v4.d[1], x11\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_256_done_%=:\n\t"
         "cbz	%w[sz], L_aes_xts_encrypt_arm64_crypto_256_partial_done_%=\n\t"
         "sub	%x[out], %x[out], #16\n\t"
         "ld1	{v0.16b}, [%x[out]], #16\n\t"
         "st1	{v0.2d}, [%x[tmp]]\n\t"
         "mov	w9, %w[sz]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_256_start_byte_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_256_start_byte_%=:\n\t"
         "ldrb	w12, [%x[tmp]]\n\t"
         "ldrb	w13, [%x[in]], #1\n\t"
         "strb	w12, [%x[out]], #1\n\t"
@@ -42125,12 +42131,12 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_256_partial_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_256_partial_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_xts_encrypt_arm64_crypto_done_%=\n\t"
         /* AES_XTS_128 */
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "ld1	{v24.2d, v25.2d}, [%x[key2]], #32\n\t"
         "ld1	{v26.2d}, [%x[key2]]\n\t"
@@ -42172,7 +42178,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_xts_encrypt_arm64_crypto_128_start_2_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_128_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "mov	v5.d[0], x12\n\t"
         "mov	v5.d[1], x13\n\t"
@@ -42287,7 +42293,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.ge	L_aes_xts_encrypt_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_128_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_xts_encrypt_arm64_crypto_128_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -42348,7 +42354,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w8, w8, #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_128_start_1_%=:\n\t"
         "cbz	w8, L_aes_xts_encrypt_arm64_crypto_128_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "eor	v0.16b, v0.16b, v4.16b\n\t"
@@ -42380,14 +42386,14 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	v4.d[1], x11\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_128_done_%=:\n\t"
         "cbz	%w[sz], L_aes_xts_encrypt_arm64_crypto_128_partial_done_%=\n\t"
         "sub	%x[out], %x[out], #16\n\t"
         "ld1	{v0.16b}, [%x[out]], #16\n\t"
         "st1	{v0.2d}, [%x[tmp]]\n\t"
         "mov	w9, %w[sz]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_128_start_byte_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_128_start_byte_%=:\n\t"
         "ldrb	w12, [%x[tmp]]\n\t"
         "ldrb	w13, [%x[in]], #1\n\t"
         "strb	w12, [%x[out]], #1\n\t"
@@ -42422,10 +42428,10 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_128_partial_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_128_partial_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_xts_encrypt_arm64_crypto_done_%=: \n\t"
+    "L_aes_xts_encrypt_arm64_crypto_done_%=:\n\t"
         "ldp	x29, x30, [sp], #32\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
           [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr)
@@ -42501,7 +42507,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_xts_decrypt_arm64_crypto_192_start_2_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_192_start_4_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_192_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "mov	v5.d[0], x12\n\t"
         "mov	v5.d[1], x13\n\t"
@@ -42632,7 +42638,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.ge	L_aes_xts_decrypt_arm64_crypto_192_start_4_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_192_start_2_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_192_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_xts_decrypt_arm64_crypto_192_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -42701,7 +42707,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w8, w8, #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_192_start_1_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_192_start_1_%=:\n\t"
         "cbz	w8, L_aes_xts_decrypt_arm64_crypto_192_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "eor	v0.16b, v0.16b, v4.16b\n\t"
@@ -42737,7 +42743,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	v4.d[1], x11\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_192_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_192_done_%=:\n\t"
         "cbz	%w[sz], L_aes_xts_decrypt_arm64_crypto_192_partial_done_%=\n\t"
         "and	x9, x19, x11, asr 63\n\t"
         "extr	x13, x11, x10, #63\n\t"
@@ -42775,7 +42781,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "add	%x[out], %x[out], #16\n\t"
         "mov	w9, %w[sz]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_192_start_byte_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_192_start_byte_%=:\n\t"
         "ldrb	w12, [%x[tmp]]\n\t"
         "ldrb	w13, [%x[in]], #1\n\t"
         "strb	w12, [%x[out]], #1\n\t"
@@ -42814,12 +42820,12 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_192_partial_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_192_partial_done_%=:\n\t"
 #endif /* !NO_AES_192 */
         "b	L_aes_xts_decrypt_arm64_crypto_done_%=\n\t"
         /* AES_XTS_256 */
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_start_256_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_start_256_%=:\n\t"
 #ifndef NO_AES_256
         "ld1	{v24.2d, v25.2d, v26.2d, v27.2d}, [%x[key2]], #0x40\n\t"
         "ld1	{v28.2d, v29.2d}, [%x[key2]], #32\n\t"
@@ -42871,7 +42877,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_xts_decrypt_arm64_crypto_256_start_2_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_256_start_4_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_256_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "mov	v5.d[0], x12\n\t"
         "mov	v5.d[1], x13\n\t"
@@ -43018,7 +43024,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.ge	L_aes_xts_decrypt_arm64_crypto_256_start_4_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_256_start_2_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_256_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_xts_decrypt_arm64_crypto_256_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -43095,7 +43101,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w8, w8, #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_256_start_1_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_256_start_1_%=:\n\t"
         "cbz	w8, L_aes_xts_decrypt_arm64_crypto_256_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "eor	v0.16b, v0.16b, v4.16b\n\t"
@@ -43135,7 +43141,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	v4.d[1], x11\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_256_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_256_done_%=:\n\t"
         "cbz	%w[sz], L_aes_xts_decrypt_arm64_crypto_256_partial_done_%=\n\t"
         "and	x9, x19, x11, asr 63\n\t"
         "extr	x13, x11, x10, #63\n\t"
@@ -43177,7 +43183,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "add	%x[out], %x[out], #16\n\t"
         "mov	w9, %w[sz]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_256_start_byte_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_256_start_byte_%=:\n\t"
         "ldrb	w12, [%x[tmp]]\n\t"
         "ldrb	w13, [%x[in]], #1\n\t"
         "strb	w12, [%x[out]], #1\n\t"
@@ -43220,12 +43226,12 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_256_partial_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_256_partial_done_%=:\n\t"
 #endif /* !NO_AES_256 */
         "b	L_aes_xts_decrypt_arm64_crypto_done_%=\n\t"
         /* AES_XTS_128 */
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_start_128_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_start_128_%=:\n\t"
 #ifndef NO_AES_128
         "ld1	{v24.2d, v25.2d}, [%x[key2]], #32\n\t"
         "ld1	{v26.2d}, [%x[key2]]\n\t"
@@ -43267,7 +43273,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.lt	L_aes_xts_decrypt_arm64_crypto_128_start_2_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_128_start_4_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_128_start_4_%=:\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "mov	v5.d[0], x12\n\t"
         "mov	v5.d[1], x13\n\t"
@@ -43382,7 +43388,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "cmp	w8, #4\n\t"
         "b.ge	L_aes_xts_decrypt_arm64_crypto_128_start_4_%=\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_128_start_2_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_128_start_2_%=:\n\t"
         "cmp	w8, #2\n\t"
         "b.lt	L_aes_xts_decrypt_arm64_crypto_128_start_1_%=\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
@@ -43443,7 +43449,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "sub	w8, w8, #2\n\t"
         "st1	{v0.16b, v1.16b}, [%x[out]], #32\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_128_start_1_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_128_start_1_%=:\n\t"
         "cbz	w8, L_aes_xts_decrypt_arm64_crypto_128_done_%=\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
         "eor	v0.16b, v0.16b, v4.16b\n\t"
@@ -43475,7 +43481,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "mov	v4.d[1], x11\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_128_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_128_done_%=:\n\t"
         "cbz	%w[sz], L_aes_xts_decrypt_arm64_crypto_128_partial_done_%=\n\t"
         "and	x9, x19, x11, asr 63\n\t"
         "extr	x13, x11, x10, #63\n\t"
@@ -43509,7 +43515,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "add	%x[out], %x[out], #16\n\t"
         "mov	w9, %w[sz]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_128_start_byte_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_128_start_byte_%=:\n\t"
         "ldrb	w12, [%x[tmp]]\n\t"
         "ldrb	w13, [%x[in]], #1\n\t"
         "strb	w12, [%x[out]], #1\n\t"
@@ -43544,10 +43550,10 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_128_partial_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_128_partial_done_%=:\n\t"
 #endif /* !NO_AES_128 */
         "\n"
-    "L_aes_xts_decrypt_arm64_crypto_done_%=: \n\t"
+    "L_aes_xts_decrypt_arm64_crypto_done_%=:\n\t"
         "ldp	x29, x30, [sp], #32\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
           [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr)
@@ -43566,7 +43572,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz,
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
-static const word8 L_AES_ARM64_NEON_te[] = {
+XALIGNED(4) static const word8 L_AES_ARM64_NEON_te[] = {
     0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
     0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
     0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
@@ -43601,7 +43607,7 @@ static const word8 L_AES_ARM64_NEON_te[] = {
     0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
 };
 
-static const word8 L_AES_ARM64_NEON_shift_rows_shuffle[] = {
+XALIGNED(4) static const word8 L_AES_ARM64_NEON_shift_rows_shuffle[] = {
     0x0c, 0x09, 0x06, 0x03, 0x00, 0x0d, 0x0a, 0x07,
     0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f,
 };
@@ -43617,7 +43623,7 @@ void AES_invert_key_NEON(unsigned char* ks, word32 rounds)
         "mov	x2, %x[ks]\n\t"
         "mov	w4, %w[rounds]\n\t"
         "\n"
-    "L_AES_invert_key_NEON_loop_%=: \n\t"
+    "L_AES_invert_key_NEON_loop_%=:\n\t"
         "ld1	{v0.2d}, [x2]\n\t"
         "ld1	{v1.2d}, [x3]\n\t"
         "st1	{v0.2d}, [x3]\n\t"
@@ -43629,7 +43635,7 @@ void AES_invert_key_NEON(unsigned char* ks, word32 rounds)
         "add	x2, %x[ks], #16\n\t"
         "sub	w4, %w[rounds], #1\n\t"
         "\n"
-    "L_AES_invert_key_NEON_mix_loop_%=: \n\t"
+    "L_AES_invert_key_NEON_mix_loop_%=:\n\t"
         "ld1	{v0.2d}, [x2]\n\t"
         "sshr	v5.16b, v0.16b, #7\n\t"
         "ushr	v6.16b, v0.16b, #6\n\t"
@@ -43667,10 +43673,10 @@ void AES_invert_key_NEON(unsigned char* ks, word32 rounds)
 }
 
 #endif /* HAVE_AES_DECRYPT */
-static const word32 L_AES_ARM64_NEON_rcon[] = {
+XALIGNED(8) static const word32 L_AES_ARM64_NEON_rcon[] = {
     0x01000000, 0x02000000, 0x04000000, 0x08000000,
     0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x1b000000, 0x36000000,
+    0x1b000000, 0x36000000
 };
 
 void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len,
@@ -43702,7 +43708,7 @@ void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len,
         "st1	{v1.2d}, [%x[ks]], #16\n\t"
         "mov	x3, #6\n\t"
         "\n"
-    "L_AES_set_encrypt_key_NEON_loop_256_%=: \n\t"
+    "L_AES_set_encrypt_key_NEON_loop_256_%=:\n\t"
         "eor	v22.16b, v1.16b, v2.16b\n\t"
         "eor	v23.16b, v1.16b, v3.16b\n\t"
         "eor	v24.16b, v1.16b, v4.16b\n\t"
@@ -43781,7 +43787,7 @@ void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len,
         "st1	{v0.2d}, [%x[ks]], #16\n\t"
         "b	L_AES_set_encrypt_key_NEON_end_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_NEON_start_192_%=: \n\t"
+    "L_AES_set_encrypt_key_NEON_start_192_%=:\n\t"
         "ld1	{v0.16b}, [%x[key]], #16\n\t"
         "ld1	{v1.8b}, [%x[key]]\n\t"
         "rev32	v0.16b, v0.16b\n\t"
@@ -43791,7 +43797,7 @@ void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len,
         "ext	v1.16b, v1.16b, v1.16b, #8\n\t"
         "mov	x3, #7\n\t"
         "\n"
-    "L_AES_set_encrypt_key_NEON_loop_192_%=: \n\t"
+    "L_AES_set_encrypt_key_NEON_loop_192_%=:\n\t"
         "eor	v22.16b, v1.16b, v2.16b\n\t"
         "eor	v23.16b, v1.16b, v3.16b\n\t"
         "eor	v24.16b, v1.16b, v4.16b\n\t"
@@ -43855,13 +43861,13 @@ void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len,
         "st1	{v0.2d}, [%x[ks]], #16\n\t"
         "b	L_AES_set_encrypt_key_NEON_end_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_NEON_start_128_%=: \n\t"
+    "L_AES_set_encrypt_key_NEON_start_128_%=:\n\t"
         "ld1	{v0.16b}, [%x[key]]\n\t"
         "rev32	v0.16b, v0.16b\n\t"
         "st1	{v0.2d}, [%x[ks]], #16\n\t"
         "mov	x3, #10\n\t"
         "\n"
-    "L_AES_set_encrypt_key_NEON_loop_128_%=: \n\t"
+    "L_AES_set_encrypt_key_NEON_loop_128_%=:\n\t"
         "eor	v22.16b, v0.16b, v2.16b\n\t"
         "eor	v23.16b, v0.16b, v3.16b\n\t"
         "eor	v24.16b, v0.16b, v4.16b\n\t"
@@ -43891,7 +43897,7 @@ void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len,
         "subs	x3, x3, #1\n\t"
         "b.ne	L_AES_set_encrypt_key_NEON_loop_128_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_NEON_end_%=: \n\t"
+    "L_AES_set_encrypt_key_NEON_end_%=:\n\t"
         : [len] "+r" (len), [ks] "+r" (ks)
         : [key] "r" (key), [rcon] "r" (rcon), [te] "r" (te)
         : "memory", "cc", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
@@ -43918,7 +43924,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.lt	L_AES_ECB_encrypt_NEON_start_2_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_loop_4_%=:\n\t"
         "mov	x8, %x[ks]\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.2d}, [x8], #16\n\t"
@@ -43933,7 +43939,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v3.16b, v3.16b, v4.16b\n\t"
         "sub	w7, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
         "tbl	v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t"
         "tbl	v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t"
@@ -44324,7 +44330,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.ge	L_AES_ECB_encrypt_NEON_loop_4_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_start_2_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_start_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -44333,7 +44339,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "b.eq	L_AES_ECB_encrypt_NEON_start_1_%=\n\t"
         "b.lt	L_AES_ECB_encrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_loop_2_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_loop_2_%=:\n\t"
         "mov	x8, %x[ks]\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
         "ld1	{v4.2d}, [x8], #16\n\t"
@@ -44344,7 +44350,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v1.16b, v1.16b, v4.16b\n\t"
         "sub	w7, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_loop_nr_2_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v1.16b, v12.16b\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
@@ -44544,7 +44550,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0\n\t"
         "b.eq	L_AES_ECB_encrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_start_1_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_start_1_%=:\n\t"
         "ld1	{v3.2d}, [%[shuffle]]\n\t"
         "mov	x8, %x[ks]\n\t"
         "ld1	{v0.16b}, [%x[in]], #16\n\t"
@@ -44554,7 +44560,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w7, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -44655,7 +44661,7 @@ void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "rev32	v0.16b, v0.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_AES_ECB_encrypt_NEON_data_done_%=: \n\t"
+    "L_AES_ECB_encrypt_NEON_data_done_%=:\n\t"
         : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr)
         : [in] "r" (in), [ks] "r" (ks), [te] "r" (te), [shuffle] "r" (shuffle)
         : "memory", "cc", "x7", "x8", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
@@ -44687,7 +44693,7 @@ void AES_CBC_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "ld1	{v0.2d}, [%x[iv]]\n\t"
         "ld1	{v26.2d}, [%[shuffle]]\n\t"
         "\n"
-    "L_AES_CBC_encrypt_NEON_loop_block_%=: \n\t"
+    "L_AES_CBC_encrypt_NEON_loop_block_%=:\n\t"
         "add	x9, %x[ks], #16\n\t"
         "ld1	{v1.16b}, [%x[in]], #16\n\t"
         "ld1	{v2.16b}, [%x[ks]]\n\t"
@@ -44697,7 +44703,7 @@ void AES_CBC_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v0.16b, v0.16b, v2.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_encrypt_NEON_loop_nr_%=: \n\t"
+    "L_AES_CBC_encrypt_NEON_loop_nr_%=:\n\t"
         "eor	v2.16b, v0.16b, v6.16b\n\t"
         "eor	v3.16b, v0.16b, v7.16b\n\t"
         "eor	v4.16b, v0.16b, v8.16b\n\t"
@@ -44831,7 +44837,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.lt	L_AES_CTR_encrypt_NEON_start_2_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_loop_4_%=:\n\t"
         "mov	x9, %x[ks]\n\t"
         "ld1	{v4.2d}, [x9], #16\n\t"
         "mov	v8.d[1], x10\n\t"
@@ -44869,7 +44875,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "rev32	v8.16b, v8.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
         "tbl	v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t"
         "tbl	v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t"
@@ -45269,7 +45275,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "rev64	v2.16b, v2.16b\n\t"
         "rev32	v2.16b, v2.16b\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_start_2_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_start_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -45278,7 +45284,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "b.eq	L_AES_CTR_encrypt_NEON_start_1_%=\n\t"
         "b.lt	L_AES_CTR_encrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_loop_2_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_loop_2_%=:\n\t"
         "mov	x9, %x[ks]\n\t"
         "ld1	{v4.2d}, [x9], #16\n\t"
         /* Round: 0 - XOR in key schedule */
@@ -45298,7 +45304,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "rev32	v2.16b, v2.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_loop_nr_2_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v1.16b, v12.16b\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
@@ -45501,7 +45507,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0\n\t"
         "b.eq	L_AES_CTR_encrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_start_1_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_start_1_%=:\n\t"
         "ld1	{v3.2d}, [%[shuffle]]\n\t"
         "mov	x9, %x[ks]\n\t"
         "ld1	{v4.2d}, [x9], #16\n\t"
@@ -45509,7 +45515,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v0.16b, v2.16b, v4.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -45618,7 +45624,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "rev64	v2.16b, v2.16b\n\t"
         "rev32	v2.16b, v2.16b\n\t"
         "\n"
-    "L_AES_CTR_encrypt_NEON_data_done_%=: \n\t"
+    "L_AES_CTR_encrypt_NEON_data_done_%=:\n\t"
         "rev32	v2.16b, v2.16b\n\t"
         "st1	{v2.2d}, [%x[ctr]]\n\t"
         : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [ctr] "+r" (ctr)
@@ -45634,7 +45640,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
 #ifdef HAVE_AES_DECRYPT
 #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
     defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB)
-static const word8 L_AES_ARM64_NEON_td[] = {
+XALIGNED(4) static const word8 L_AES_ARM64_NEON_td[] = {
     0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
     0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
     0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
@@ -45669,12 +45675,13 @@ static const word8 L_AES_ARM64_NEON_td[] = {
     0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
 };
 
-static const word8 L_AES_ARM64_NEON_shift_rows_invshuffle[] = {
+XALIGNED(4) static const word8 L_AES_ARM64_NEON_shift_rows_invshuffle[] = {
     0x04, 0x09, 0x0e, 0x03, 0x08, 0x0d, 0x02, 0x07,
     0x0c, 0x01, 0x06, 0x0b, 0x00, 0x05, 0x0a, 0x0f,
 };
 
-#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB)
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(HAVE_AES_ECB)
 void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
     unsigned long len, const unsigned char* ks, int nr);
 void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
@@ -45690,7 +45697,7 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.lt	L_AES_ECB_decrypt_NEON_start_2_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_loop_4_%=:\n\t"
         "mov	x8, %x[ks]\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.2d}, [x8], #16\n\t"
@@ -45705,7 +45712,7 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v3.16b, v3.16b, v4.16b\n\t"
         "sub	w7, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
         "tbl	v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t"
         "tbl	v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t"
@@ -46243,12 +46250,12 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.ge	L_AES_ECB_decrypt_NEON_loop_4_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_start_2_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_start_2_%=:\n\t"
         "cmp	%x[len], #16\n\t"
         "b.eq	L_AES_ECB_decrypt_NEON_start_1_%=\n\t"
         "b.lt	L_AES_ECB_decrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_loop_2_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_loop_2_%=:\n\t"
         "mov	x8, %x[ks]\n\t"
         "ld1	{v0.16b, v1.16b}, [%x[in]], #32\n\t"
         "ld1	{v4.2d}, [x8], #16\n\t"
@@ -46259,7 +46266,7 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v1.16b, v1.16b, v4.16b\n\t"
         "sub	w7, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_loop_nr_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -46546,7 +46553,7 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0\n\t"
         "b.eq	L_AES_ECB_decrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_start_1_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_start_1_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -46560,7 +46567,7 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w7, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -46697,7 +46704,7 @@ void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "rev32	v0.16b, v0.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_AES_ECB_decrypt_NEON_data_done_%=: \n\t"
+    "L_AES_ECB_decrypt_NEON_data_done_%=:\n\t"
         : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr)
         : [in] "r" (in), [ks] "r" (ks), [td] "r" (td),
           [invshuffle] "r" (invshuffle)
@@ -46729,7 +46736,7 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.lt	L_AES_CBC_decrypt_NEON_start_2_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_loop_4_%=:\n\t"
         "mov	x9, %x[ks]\n\t"
         "ld1	{v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t"
         "st1	{v3.2d, v4.2d, v5.2d, v6.2d}, [x10]\n\t"
@@ -46746,7 +46753,7 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v7.16b, v7.16b, v8.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t"
         "tbl	v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t"
         "tbl	v10.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t"
@@ -47290,12 +47297,12 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0x40\n\t"
         "b.ge	L_AES_CBC_decrypt_NEON_loop_4_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_start_2_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_start_2_%=:\n\t"
         "cmp	%x[len], #16\n\t"
         "b.eq	L_AES_CBC_decrypt_NEON_start_1_%=\n\t"
         "b.lt	L_AES_CBC_decrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_loop_2_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_loop_2_%=:\n\t"
         "mov	x9, %x[ks]\n\t"
         "ld1	{v4.16b, v5.16b}, [%x[in]], #32\n\t"
         "st1	{v3.2d, v4.2d, v5.2d}, [x10]\n\t"
@@ -47307,7 +47314,7 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v5.16b, v5.16b, v8.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_loop_nr_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -47599,7 +47606,7 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0\n\t"
         "b.eq	L_AES_CBC_decrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_start_1_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_start_1_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -47615,7 +47622,7 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v4.16b, v4.16b, v8.16b\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v0.16b, v4.16b, v12.16b\n\t"
         "eor	v1.16b, v4.16b, v13.16b\n\t"
         "eor	v2.16b, v4.16b, v14.16b\n\t"
@@ -47754,7 +47761,7 @@ void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v4.16b, v4.16b, v10.16b\n\t"
         "st1	{v4.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_AES_CBC_decrypt_NEON_data_done_%=: \n\t"
+    "L_AES_CBC_decrypt_NEON_data_done_%=:\n\t"
         "st1	{v3.2d}, [%x[iv]]\n\t"
         "ldp	x29, x30, [sp], #0x60\n\t"
         : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [iv] "+r" (iv)
@@ -47788,7 +47795,7 @@ void GCM_gmult_len_NEON(unsigned char* x, const unsigned char* h,
         "ushr	v13.16b, v10.16b, #4\n\t"
         "eor	v14.16b, v12.16b, v13.16b\n\t"
         "\n"
-    "L_GCM_gmult_len_NEON_start_block_%=: \n\t"
+    "L_GCM_gmult_len_NEON_start_block_%=:\n\t"
         "ld1	{v0.16b}, [%x[data]], #16\n\t"
         "rbit	v0.16b, v0.16b\n\t"
         "eor	v18.16b, v18.16b, v0.16b\n\t"
@@ -48122,7 +48129,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "mov	x7, v2.d[0]\n\t"
         "mov	x8, v2.d[1]\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_loop_4_%=:\n\t"
         "mov	x12, %x[ks]\n\t"
         "ld1	{v4.2d}, [x12], #16\n\t"
         "mov	v8.d[0], x7\n\t"
@@ -48142,7 +48149,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v3.16b, v8.16b, v4.16b\n\t"
         "sub	w11, %w[nr], #2\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
         "tbl	v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t"
         "tbl	v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t"
@@ -48541,7 +48548,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "mov	v2.d[1], x8\n\t"
         "mov	v2.s[3], w6\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_start_2_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_start_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -48550,7 +48557,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "b.eq	L_AES_GCM_encrypt_NEON_start_1_%=\n\t"
         "b.lt	L_AES_GCM_encrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_loop_2_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_loop_2_%=:\n\t"
         "mov	x12, %x[ks]\n\t"
         "ld1	{v4.2d}, [x12], #16\n\t"
         /* Round: 0 - XOR in key schedule */
@@ -48562,7 +48569,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v1.16b, v2.16b, v4.16b\n\t"
         "sub	w11, %w[nr], #2\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_loop_nr_2_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v1.16b, v12.16b\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
@@ -48765,7 +48772,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "cmp	%x[len], #0\n\t"
         "b.eq	L_AES_GCM_encrypt_NEON_data_done_%=\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_start_1_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_start_1_%=:\n\t"
         "ld1	{v3.2d}, [%[shuffle]]\n\t"
         "mov	x12, %x[ks]\n\t"
         "add	w6, w6, #1\n\t"
@@ -48775,7 +48782,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v0.16b, v2.16b, v4.16b\n\t"
         "sub	w11, %w[nr], #2\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -48878,7 +48885,7 @@ void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "st1	{v0.16b}, [%x[out]], #16\n\t"
         "\n"
-    "L_AES_GCM_encrypt_NEON_data_done_%=: \n\t"
+    "L_AES_GCM_encrypt_NEON_data_done_%=:\n\t"
         "rev32	v2.16b, v2.16b\n\t"
         "st1	{v2.2d}, [%x[ctr]]\n\t"
         : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [ctr] "+r" (ctr)
@@ -48919,7 +48926,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v2.16b, v2.16b, v4.16b\n\t"
         "sub	w21, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_loop_nr_tweak_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_loop_nr_tweak_%=:\n\t"
         "eor	v8.16b, v2.16b, v12.16b\n\t"
         "eor	v9.16b, v2.16b, v13.16b\n\t"
         "eor	v10.16b, v2.16b, v14.16b\n\t"
@@ -49023,7 +49030,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "cmp	%w[sz], #0x40\n\t"
         "b.lt	L_AES_XTS_encrypt_NEON_start_2_%=\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_loop_4_%=:\n\t"
         "mov	x22, %x[key]\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b}, [x22], #16\n\t"
@@ -49058,7 +49065,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v3.16b, v3.16b, v4.16b\n\t"
         "sub	w21, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
         "tbl	v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t"
         "tbl	v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t"
@@ -49468,7 +49475,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "movi	v14.16b, #0xc0\n\t"
         "movi	v15.16b, #27\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_start_2_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_start_2_%=:\n\t"
         "cmp	%w[sz], #32\n\t"
         "b.lt	L_AES_XTS_encrypt_NEON_start_1_%=\n\t"
         "mov	x22, %x[key]\n\t"
@@ -49492,7 +49499,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v1.16b, v1.16b, v4.16b\n\t"
         "sub	w21, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_loop_nr_2_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v1.16b, v12.16b\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
@@ -49695,7 +49702,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x8, x16, x10, lsl 1\n\t"
         "sub	%w[sz], %w[sz], #32\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_start_1_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_start_1_%=:\n\t"
         "ld1	{v3.2d}, [%[shuffle]]\n\t"
         "mov	v2.d[0], x8\n\t"
         "mov	v2.d[1], x9\n\t"
@@ -49709,7 +49716,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w21, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -49816,7 +49823,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "extr	x9, x9, x8, #63\n\t"
         "eor	x8, x16, x8, lsl 1\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_start_partial_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_start_partial_%=:\n\t"
         "cbz	%w[sz], L_AES_XTS_encrypt_NEON_data_done_%=\n\t"
         "mov	v2.d[0], x8\n\t"
         "mov	v2.d[1], x9\n\t"
@@ -49826,7 +49833,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "st1	{v0.2d}, [%x[tmp]]\n\t"
         "mov	w16, %w[sz]\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_start_byte_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_start_byte_%=:\n\t"
         "ldrb	w10, [%x[tmp]]\n\t"
         "ldrb	w11, [%x[in]], #1\n\t"
         "strb	w10, [%x[out]], #1\n\t"
@@ -49843,7 +49850,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w21, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_loop_nr_partial_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_loop_nr_partial_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -49945,7 +49952,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v2.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_AES_XTS_encrypt_NEON_data_done_%=: \n\t"
+    "L_AES_XTS_encrypt_NEON_data_done_%=:\n\t"
         "ldp	x29, x30, [sp], #32\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
           [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr)
@@ -49991,7 +49998,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v2.16b, v2.16b, v4.16b\n\t"
         "sub	w24, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_nr_tweak_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_nr_tweak_%=:\n\t"
         "eor	v8.16b, v2.16b, v12.16b\n\t"
         "eor	v9.16b, v2.16b, v13.16b\n\t"
         "eor	v10.16b, v2.16b, v14.16b\n\t"
@@ -50100,7 +50107,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "cmp	%w[sz], #0x40\n\t"
         "b.lt	L_AES_XTS_decrypt_NEON_start_2_%=\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_4_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_4_%=:\n\t"
         "mov	x25, %x[key]\n\t"
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t"
         "ld1	{v4.16b}, [x25], #16\n\t"
@@ -50135,7 +50142,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v3.16b, v3.16b, v4.16b\n\t"
         "sub	w24, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_nr_4_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_nr_4_%=:\n\t"
         "tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t"
         "tbl	v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t"
         "tbl	v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t"
@@ -50692,7 +50699,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "movi	v14.16b, #0xc0\n\t"
         "movi	v15.16b, #27\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_start_2_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_start_2_%=:\n\t"
         "cmp	%w[sz], #32\n\t"
         "b.lt	L_AES_XTS_decrypt_NEON_start_1_%=\n\t"
         "mov	x25, %x[key]\n\t"
@@ -50716,7 +50723,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v1.16b, v1.16b, v4.16b\n\t"
         "sub	w24, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_nr_2_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_nr_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
         "movi	v13.16b, #0x80\n\t"
         "movi	v14.16b, #0xc0\n\t"
@@ -51006,7 +51013,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x8, x16, x10, lsl 1\n\t"
         "sub	%w[sz], %w[sz], #32\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_start_1_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_start_1_%=:\n\t"
         "ld1	{v3.2d}, [%[invshuffle]]\n\t"
         "mov	v2.d[0], x8\n\t"
         "mov	v2.d[1], x9\n\t"
@@ -51020,7 +51027,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w24, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_nr_1_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_nr_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -51163,7 +51170,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "extr	x9, x9, x8, #63\n\t"
         "eor	x8, x16, x8, lsl 1\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_start_partial_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_start_partial_%=:\n\t"
         "mov	%w[sz], w19\n\t"
         "cbz	%w[sz], L_AES_XTS_decrypt_NEON_data_done_%=\n\t"
         "mov	v2.d[0], x8\n\t"
@@ -51181,7 +51188,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w24, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -51321,7 +51328,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "add	%x[out], %x[out], #16\n\t"
         "mov	w16, %w[sz]\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_start_byte_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_start_byte_%=:\n\t"
         "ldrb	w10, [%x[tmp]]\n\t"
         "ldrb	w11, [%x[in]], #1\n\t"
         "strb	w10, [%x[out]], #1\n\t"
@@ -51339,7 +51346,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v4.16b\n\t"
         "sub	w24, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=:\n\t"
         "eor	v8.16b, v0.16b, v12.16b\n\t"
         "eor	v9.16b, v0.16b, v13.16b\n\t"
         "eor	v10.16b, v0.16b, v14.16b\n\t"
@@ -51477,7 +51484,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	v0.16b, v0.16b, v2.16b\n\t"
         "st1	{v0.16b}, [%x[out]]\n\t"
         "\n"
-    "L_AES_XTS_decrypt_NEON_data_done_%=: \n\t"
+    "L_AES_XTS_decrypt_NEON_data_done_%=:\n\t"
         "ldp	x29, x30, [sp], #32\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
           [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr)
@@ -51496,7 +51503,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
 #endif /* !WOLFSSL_ARMASM_NO_NEON */
 #ifndef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP
 #ifdef HAVE_AES_DECRYPT
-static const word32 L_AES_ARM64_td[] = {
+XALIGNED(8) static const word32 L_AES_ARM64_td[] = {
     0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e,
     0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303,
     0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c,
@@ -51567,7 +51574,7 @@ static const word32 L_AES_ARM64_td[] = {
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
-static const word32 L_AES_ARM64_te[] = {
+XALIGNED(8) static const word32 L_AES_ARM64_te[] = {
     0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
     0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
     0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b,
@@ -51646,7 +51653,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds)
         "add	x12, %x[ks], %x[rounds], lsl 4\n\t"
         "mov	w13, %w[rounds]\n\t"
         "\n"
-    "L_AES_invert_key_loop_%=: \n\t"
+    "L_AES_invert_key_loop_%=:\n\t"
         "ldp	w4, w5, [%x[ks]]\n\t"
         "ldnp	w6, w7, [%x[ks], #8]\n\t"
         "ldp	w8, w9, [x12]\n\t"
@@ -51662,7 +51669,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds)
         "add	%x[ks], %x[ks], #16\n\t"
         "sub	w13, %w[rounds], #1\n\t"
         "\n"
-    "L_AES_invert_key_mix_loop_%=: \n\t"
+    "L_AES_invert_key_mix_loop_%=:\n\t"
         "ldp	w4, w5, [%x[ks]]\n\t"
         "ldnp	w6, w7, [%x[ks], #8]\n\t"
         "ubfx	w8, w4, #0, #8\n\t"
@@ -51755,10 +51762,10 @@ void AES_invert_key(unsigned char* ks, word32 rounds)
 }
 
 #endif /* HAVE_AES_DECRYPT */
-static const word32 L_AES_ARM64_rcon[] = {
+XALIGNED(8) static const word32 L_AES_ARM64_rcon[] = {
     0x01000000, 0x02000000, 0x04000000, 0x08000000,
     0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x1b000000, 0x36000000,
+    0x1b000000, 0x36000000
 };
 
 void AES_set_encrypt_key(const unsigned char* key, word32 len,
@@ -51796,7 +51803,7 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "sub	%x[ks], %x[ks], #16\n\t"
         "mov	x4, #6\n\t"
         "\n"
-    "L_AES_set_encrypt_key_loop_256_%=: \n\t"
+    "L_AES_set_encrypt_key_loop_256_%=:\n\t"
         "ubfx	w6, w9, #0, #8\n\t"
         "ubfx	w7, w9, #8, #8\n\t"
         "ubfx	w8, w9, #16, #8\n\t"
@@ -51881,7 +51888,7 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "sub	%x[ks], %x[ks], #16\n\t"
         "b	L_AES_set_encrypt_key_end_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_start_192_%=: \n\t"
+    "L_AES_set_encrypt_key_start_192_%=:\n\t"
         "ldr	w6, [%x[key]]\n\t"
         "ldr	w7, [%x[key], #4]\n\t"
         "ldr	w8, [%x[key], #8]\n\t"
@@ -51899,7 +51906,7 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "stnp	w10, w11, [%x[ks], #16]\n\t"
         "mov	x4, #7\n\t"
         "\n"
-    "L_AES_set_encrypt_key_loop_192_%=: \n\t"
+    "L_AES_set_encrypt_key_loop_192_%=:\n\t"
         "ubfx	w6, w11, #0, #8\n\t"
         "ubfx	w7, w11, #8, #8\n\t"
         "ubfx	w8, w11, #16, #8\n\t"
@@ -51959,7 +51966,7 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "stnp	w8, w9, [%x[ks], #8]\n\t"
         "b	L_AES_set_encrypt_key_end_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_start_128_%=: \n\t"
+    "L_AES_set_encrypt_key_start_128_%=:\n\t"
         "ldr	w6, [%x[key]]\n\t"
         "ldr	w7, [%x[key], #4]\n\t"
         "ldr	w8, [%x[key], #8]\n\t"
@@ -51972,7 +51979,7 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "stnp	w8, w9, [%x[ks], #8]\n\t"
         "mov	x4, #10\n\t"
         "\n"
-    "L_AES_set_encrypt_key_loop_128_%=: \n\t"
+    "L_AES_set_encrypt_key_loop_128_%=:\n\t"
         "ubfx	w6, w9, #0, #8\n\t"
         "ubfx	w7, w9, #8, #8\n\t"
         "ubfx	w8, w9, #16, #8\n\t"
@@ -52001,7 +52008,7 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "subs	x4, x4, #1\n\t"
         "b.ne	L_AES_set_encrypt_key_loop_128_%=\n\t"
         "\n"
-    "L_AES_set_encrypt_key_end_%=: \n\t"
+    "L_AES_set_encrypt_key_end_%=:\n\t"
         : [len] "+r" (len), [ks] "+r" (ks)
         : [key] "r" (key), [rcon] "r" (rcon), [te] "r" (te)
         : "memory", "cc", "x3", "x4", "x6", "x7", "x8", "x9", "x10", "x11"
@@ -52019,7 +52026,7 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
     const word32* te = L_AES_ARM64_te;
     __asm__ __volatile__ (
         "\n"
-    "L_AES_ECB_encrypt_loop_block_128_%=: \n\t"
+    "L_AES_ECB_encrypt_loop_block_128_%=:\n\t"
         "mov	x17, %x[ks]\n\t"
         "ldr	x6, [%x[in]]\n\t"
         "ldr	x7, [%x[in], #8]\n\t"
@@ -52031,7 +52038,7 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
         "eor	x7, x7, x11\n\t"
         "sub	w16, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_encrypt_loop_nr_%=: \n\t"
+    "L_AES_ECB_encrypt_loop_nr_%=:\n\t"
         "ubfx	x10, x6, #48, #8\n\t"
         "ubfx	x13, x6, #24, #8\n\t"
         "ubfx	x14, x7, #8, #8\n\t"
@@ -52341,7 +52348,7 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
     __asm__ __volatile__ (
         "ldp	x7, x8, [%x[iv]]\n\t"
         "\n"
-    "L_AES_CBC_encrypt_loop_block_%=: \n\t"
+    "L_AES_CBC_encrypt_loop_block_%=:\n\t"
         "mov	x19, %x[ks]\n\t"
         "ldr	x11, [%x[in]]\n\t"
         "ldr	x12, [%x[in], #8]\n\t"
@@ -52355,7 +52362,7 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
         "eor	x8, x8, x12\n\t"
         "sub	w17, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_encrypt_loop_nr_%=: \n\t"
+    "L_AES_CBC_encrypt_loop_nr_%=:\n\t"
         "ubfx	x11, x7, #48, #8\n\t"
         "ubfx	x14, x7, #24, #8\n\t"
         "ubfx	x15, x8, #8, #8\n\t"
@@ -52667,7 +52674,7 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out,
         "rev32	x15, x15\n\t"
         "rev32	x16, x16\n\t"
         "\n"
-    "L_AES_CTR_encrypt_loop_block_128_%=: \n\t"
+    "L_AES_CTR_encrypt_loop_block_128_%=:\n\t"
         "mov	x21, %x[ks]\n\t"
         "ldp	x11, x12, [x21], #16\n\t"
         /* Round: 0 - XOR in key schedule */
@@ -52675,7 +52682,7 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out,
         "eor	x8, x16, x12\n\t"
         "sub	w20, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CTR_encrypt_loop_nr_%=: \n\t"
+    "L_AES_CTR_encrypt_loop_nr_%=:\n\t"
         "ubfx	x11, x7, #48, #8\n\t"
         "ubfx	x14, x7, #24, #8\n\t"
         "ubfx	x17, x8, #8, #8\n\t"
@@ -52990,7 +52997,7 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out,
 #ifdef HAVE_AES_DECRYPT
 #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
     defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB)
-static const word8 L_AES_ARM64_td4[] = {
+XALIGNED(4) static const word8 L_AES_ARM64_td4[] = {
     0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
     0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
     0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
@@ -53025,7 +53032,8 @@ static const word8 L_AES_ARM64_td4[] = {
     0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
 };
 
-#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB)
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(HAVE_AES_ECB)
 void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
     unsigned long len, const unsigned char* ks, int nr);
 void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
@@ -53035,7 +53043,7 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
     const word8* td4 = L_AES_ARM64_td4;
     __asm__ __volatile__ (
         "\n"
-    "L_AES_ECB_decrypt_loop_block_%=: \n\t"
+    "L_AES_ECB_decrypt_loop_block_%=:\n\t"
         "mov	x19, %x[ks]\n\t"
         "ldr	x7, [%x[in]]\n\t"
         "ldr	x8, [%x[in], #8]\n\t"
@@ -53047,7 +53055,7 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
         "eor	x8, x8, x12\n\t"
         "sub	w17, %w[nr], #2\n\t"
         "\n"
-    "L_AES_ECB_decrypt_loop_nr_%=: \n\t"
+    "L_AES_ECB_decrypt_loop_nr_%=:\n\t"
         "ubfx	x11, x8, #48, #8\n\t"
         "ubfx	x14, x7, #24, #8\n\t"
         "ubfx	x15, x8, #8, #8\n\t"
@@ -53328,7 +53336,7 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
     const word32* td = L_AES_ARM64_td;
     __asm__ __volatile__ (
         "\n"
-    "L_AES_CBC_decrypt_loop_block_%=: \n\t"
+    "L_AES_CBC_decrypt_loop_block_%=:\n\t"
         "mov	x20, %x[ks]\n\t"
         "ldr	x8, [%x[in]]\n\t"
         "ldr	x9, [%x[in], #8]\n\t"
@@ -53341,7 +53349,7 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
         "eor	x9, x9, x13\n\t"
         "sub	w19, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_decrypt_loop_nr_even_%=: \n\t"
+    "L_AES_CBC_decrypt_loop_nr_even_%=:\n\t"
         "ubfx	x12, x9, #48, #8\n\t"
         "ubfx	x15, x8, #24, #8\n\t"
         "ubfx	x16, x9, #8, #8\n\t"
@@ -53619,7 +53627,7 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
         "eor	x9, x9, x13\n\t"
         "sub	w19, %w[nr], #2\n\t"
         "\n"
-    "L_AES_CBC_decrypt_loop_nr_odd_%=: \n\t"
+    "L_AES_CBC_decrypt_loop_nr_odd_%=:\n\t"
         "ubfx	x12, x9, #48, #8\n\t"
         "ubfx	x15, x8, #24, #8\n\t"
         "ubfx	x16, x9, #8, #8\n\t"
@@ -53887,11 +53895,11 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
         "b.ne	L_AES_CBC_decrypt_loop_block_%=\n\t"
         "b	L_AES_CBC_decrypt_end_dec_%=\n\t"
         "\n"
-    "L_AES_CBC_decrypt_end_dec_odd_%=: \n\t"
+    "L_AES_CBC_decrypt_end_dec_odd_%=:\n\t"
         "ldnp	x12, x13, [%x[iv], #16]\n\t"
         "stp	x12, x13, [%x[iv]]\n\t"
         "\n"
-    "L_AES_CBC_decrypt_end_dec_%=: \n\t"
+    "L_AES_CBC_decrypt_end_dec_%=:\n\t"
         : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [iv] "+r" (iv)
         : [in] "r" (in), [ks] "r" (ks), [td4] "r" (td4), [td] "r" (td)
         : "memory", "cc", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
@@ -53904,7 +53912,7 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
         * HAVE_AES_ECB */
 #endif /* HAVE_AES_DECRYPT */
 #ifdef HAVE_AESGCM
-static const word32 L_GCM_gmult_len_r[] = {
+XALIGNED(8) static const word32 L_GCM_gmult_len_r[] = {
     0x00000000, 0x1c200000, 0x38400000, 0x24600000,
     0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000,
     0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000,
@@ -53923,7 +53931,7 @@ void GCM_gmult_len(unsigned char* x, const unsigned char** m,
     const word32* r = L_GCM_gmult_len_r;
     __asm__ __volatile__ (
         "\n"
-    "L_GCM_gmult_len_start_block_%=: \n\t"
+    "L_GCM_gmult_len_start_block_%=:\n\t"
         "ldp	x4, x5, [%x[x]]\n\t"
         "ldp	x6, x7, [%x[data]]\n\t"
         "eor	x4, x4, x6\n\t"
@@ -54340,7 +54348,7 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out,
         "rev32	x16, x16\n\t"
         "rev32	x17, x17\n\t"
         "\n"
-    "L_AES_GCM_encrypt_loop_block_%=: \n\t"
+    "L_AES_GCM_encrypt_loop_block_%=:\n\t"
         "mov	x21, %x[ks]\n\t"
         "lsr	x9, x17, #32\n\t"
         "ldp	x10, x11, [x21], #16\n\t"
@@ -54351,7 +54359,7 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out,
         "eor	x7, x17, x11\n\t"
         "sub	w20, %w[nr], #2\n\t"
         "\n"
-    "L_AES_GCM_encrypt_loop_nr_%=: \n\t"
+    "L_AES_GCM_encrypt_loop_nr_%=:\n\t"
         "ubfx	x10, x6, #48, #8\n\t"
         "ubfx	x13, x6, #24, #8\n\t"
         "ubfx	x14, x7, #8, #8\n\t"
@@ -54676,7 +54684,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x22, x22, x15\n\t"
         "sub	w25, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_loop_nr_tweak_%=: \n\t"
+    "L_AES_XTS_encrypt_loop_nr_tweak_%=:\n\t"
         "ubfx	x14, x21, #48, #8\n\t"
         "ubfx	x17, x21, #24, #8\n\t"
         "ubfx	x19, x22, #8, #8\n\t"
@@ -54962,7 +54970,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "rev32	x21, x21\n\t"
         "rev32	x22, x22\n\t"
         "\n"
-    "L_AES_XTS_encrypt_loop_block_%=: \n\t"
+    "L_AES_XTS_encrypt_loop_block_%=:\n\t"
         "mov	x26, %x[key]\n\t"
         "ldp	x10, x11, [%x[in]]\n\t"
         "ldp	x14, x15, [x26], #16\n\t"
@@ -54975,7 +54983,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x11, x11, x15\n\t"
         "sub	w25, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_loop_nr_%=: \n\t"
+    "L_AES_XTS_encrypt_loop_nr_%=:\n\t"
         "ubfx	x14, x10, #48, #8\n\t"
         "ubfx	x17, x10, #24, #8\n\t"
         "ubfx	x19, x11, #8, #8\n\t"
@@ -55278,7 +55286,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "stp	x10, x11, [%x[tmp]]\n\t"
         "mov	w14, %w[sz]\n\t"
         "\n"
-    "L_AES_XTS_encrypt_start_byte_%=: \n\t"
+    "L_AES_XTS_encrypt_start_byte_%=:\n\t"
         "ldrb	w19, [%x[tmp]]\n\t"
         "ldrb	w20, [%x[in]], #1\n\t"
         "strb	w19, [%x[out]], #1\n\t"
@@ -55299,7 +55307,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x11, x11, x15\n\t"
         "sub	w25, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_encrypt_loop_nr_partial_%=: \n\t"
+    "L_AES_XTS_encrypt_loop_nr_partial_%=:\n\t"
         "ubfx	x14, x10, #48, #8\n\t"
         "ubfx	x17, x10, #24, #8\n\t"
         "ubfx	x19, x11, #8, #8\n\t"
@@ -55588,7 +55596,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x11, x11, x22\n\t"
         "stp	x10, x11, [%x[out]]\n\t"
         "\n"
-    "L_AES_XTS_encrypt_done_data_%=: \n\t"
+    "L_AES_XTS_encrypt_done_data_%=:\n\t"
         "ldp	x29, x30, [sp], #32\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
           [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr)
@@ -55623,7 +55631,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x24, x24, x17\n\t"
         "sub	w27, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_loop_nr_tweak_%=: \n\t"
+    "L_AES_XTS_decrypt_loop_nr_tweak_%=:\n\t"
         "ubfx	x16, x23, #48, #8\n\t"
         "ubfx	x20, x23, #24, #8\n\t"
         "ubfx	x21, x24, #8, #8\n\t"
@@ -55911,7 +55919,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "cmp	%w[sz], #16\n\t"
         "b.lt	L_AES_XTS_decrypt_start_partail_%=\n\t"
         "\n"
-    "L_AES_XTS_decrypt_loop_block_%=: \n\t"
+    "L_AES_XTS_decrypt_loop_block_%=:\n\t"
         "mov	x28, %x[key]\n\t"
         "ldp	x12, x13, [%x[in]]\n\t"
         "ldp	x16, x17, [x28], #16\n\t"
@@ -55924,7 +55932,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x13, x13, x17\n\t"
         "sub	w27, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_loop_nr_%=: \n\t"
+    "L_AES_XTS_decrypt_loop_nr_%=:\n\t"
         "ubfx	x16, x13, #48, #8\n\t"
         "ubfx	x20, x12, #24, #8\n\t"
         "ubfx	x21, x13, #8, #8\n\t"
@@ -56194,7 +56202,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "b.ge	L_AES_XTS_decrypt_loop_block_%=\n\t"
         "cbz	%w[sz], L_AES_XTS_decrypt_done_data_%=\n\t"
         "\n"
-    "L_AES_XTS_decrypt_start_partail_%=: \n\t"
+    "L_AES_XTS_decrypt_start_partail_%=:\n\t"
         "and	x21, x11, x24, asr 63\n\t"
         "extr	x26, x24, x23, #63\n\t"
         "eor	x25, x21, x23, lsl 1\n\t"
@@ -56210,7 +56218,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x13, x13, x17\n\t"
         "sub	w27, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_loop_nr_partial_1_%=: \n\t"
+    "L_AES_XTS_decrypt_loop_nr_partial_1_%=:\n\t"
         "ubfx	x16, x13, #48, #8\n\t"
         "ubfx	x20, x12, #24, #8\n\t"
         "ubfx	x21, x13, #8, #8\n\t"
@@ -56473,7 +56481,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "add	%x[out], %x[out], #16\n\t"
         "mov	w16, %w[sz]\n\t"
         "\n"
-    "L_AES_XTS_decrypt_start_byte_%=: \n\t"
+    "L_AES_XTS_decrypt_start_byte_%=:\n\t"
         "ldrb	w21, [%x[tmp]]\n\t"
         "ldrb	w22, [%x[in]], #1\n\t"
         "strb	w21, [%x[out]], #1\n\t"
@@ -56495,7 +56503,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x13, x13, x17\n\t"
         "sub	w27, %w[nr], #2\n\t"
         "\n"
-    "L_AES_XTS_decrypt_loop_nr_partial_2_%=: \n\t"
+    "L_AES_XTS_decrypt_loop_nr_partial_2_%=:\n\t"
         "ubfx	x16, x13, #48, #8\n\t"
         "ubfx	x20, x12, #24, #8\n\t"
         "ubfx	x21, x13, #8, #8\n\t"
@@ -56756,7 +56764,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "eor	x13, x13, x24\n\t"
         "stp	x12, x13, [%x[out]]\n\t"
         "\n"
-    "L_AES_XTS_decrypt_done_data_%=: \n\t"
+    "L_AES_XTS_decrypt_done_data_%=:\n\t"
         "ldp	x29, x30, [sp], #32\n\t"
         : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key),
           [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr)
diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-chacha-asm.S
index 1bc3a294e8d..93e9d8e635c 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-chacha-asm.S
@@ -32,40 +32,36 @@
 #ifdef HAVE_CHACHA
 #ifndef __APPLE__
 	.text
-	.type	L_chacha20_arm64_ctr, %object
 	.section	.rodata
+	.type	L_chacha20_arm64_ctr, %object
 	.size	L_chacha20_arm64_ctr, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_chacha20_arm64_ctr:
-	.word	0x00000000
-	.word	0x00000001
-	.word	0x00000002
-	.word	0x00000003
+	.long	0x00000000,0x00000001,0x00000002,0x00000003
 #ifndef __APPLE__
 	.text
-	.type	L_chacha20_arm64_rol8, %object
 	.section	.rodata
+	.type	L_chacha20_arm64_rol8, %object
 	.size	L_chacha20_arm64_rol8, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_chacha20_arm64_rol8:
-	.word	0x02010003
-	.word	0x06050407
-	.word	0x0a09080b
-	.word	0x0e0d0c0f
+	.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
 #ifndef WOLFSSL_ARMASM_NO_NEON
 #ifndef __APPLE__
 .text
@@ -1009,26 +1005,21 @@ _wc_chacha_setiv:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_chacha_setkey_arm64_constant, %object
 	.section	.rodata
+	.type	L_chacha_setkey_arm64_constant, %object
 	.size	L_chacha_setkey_arm64_constant, 32
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_chacha_setkey_arm64_constant:
-	.word	0x61707865
-	.word	0x3120646e
-	.word	0x79622d36
-	.word	0x6b206574
-	.word	0x61707865
-	.word	0x3320646e
-	.word	0x79622d32
-	.word	0x6b206574
+	.long	0x61707865,0x3120646e,0x79622d36,0x6b206574
+	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574
 #ifndef __APPLE__
 .text
 .globl	wc_chacha_setkey
diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
index e9720680f27..e440bdee643 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
@@ -33,11 +33,11 @@
 #ifdef HAVE_CHACHA
 #include <wolfssl/wolfcrypt/chacha.h>
 
-static const word32 L_chacha20_arm64_ctr[] = {
+XALIGNED(8) static const word32 L_chacha20_arm64_ctr[] = {
     0x00000000, 0x00000001, 0x00000002, 0x00000003,
 };
 
-static const word32 L_chacha20_arm64_rol8[] = {
+XALIGNED(8) static const word32 L_chacha20_arm64_rol8[] = {
     0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f,
 };
 
@@ -62,7 +62,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "b.lt	L_chacha_crypt_bytes_arm64_lt_320_%=\n\t"
         "mov	w25, #4\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_loop_320_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_loop_320_%=:\n\t"
         /* Move state into regular register */
         "mov	x8, v16.d[0]\n\t"
         "mov	x10, v16.d[1]\n\t"
@@ -104,7 +104,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         /* Set number of odd+even rounds to perform */
         "mov	x26, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_round_start_320_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_round_start_320_%=:\n\t"
         "subs	x26, x26, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -436,7 +436,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "b.ge	L_chacha_crypt_bytes_arm64_loop_320_%=\n\t"
         /* Done doing 320 bytes at a time */
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_320_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_320_%=:\n\t"
         "cmp	%w[len], #0x100\n\t"
         "b.lt	L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
         /* Move state into vector registers */
@@ -461,7 +461,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         /* Set number of odd+even rounds to perform */
         "mov	x26, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_round_start_256_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_round_start_256_%=:\n\t"
         "subs	x26, x26, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -669,7 +669,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "add	v19.4s, v19.4s, v29.4s\n\t"
         /* Done 256-byte block */
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_256_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_256_%=:\n\t"
         "cmp	%w[len], #0x80\n\t"
         "b.lt	L_chacha_crypt_bytes_arm64_lt_128_%=\n\t"
         "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
@@ -687,7 +687,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         /* Set number of odd+even rounds to perform */
         "mov	x26, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_round_start_128_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_round_start_128_%=:\n\t"
         "subs	x26, x26, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -793,12 +793,12 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "sub	%w[len], %w[len], #0x80\n\t"
         /* Done 128-byte block */
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_128_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_128_%=:\n\t"
         "cmp	%w[len], #0\n\t"
         "b.eq	L_chacha_crypt_bytes_arm64_done_all_%=\n\t"
         "mov	%w[rol8], #0x40\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_loop_64_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_loop_64_%=:\n\t"
         /* Move state into vector registers */
         "mov	v0.16b, v16.16b\n\t"
         "mov	v1.16b, v17.16b\n\t"
@@ -807,7 +807,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         /* Set number of odd+even rounds to perform */
         "mov	x26, #10\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_round_64_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_round_64_%=:\n\t"
         "subs	x26, x26, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
@@ -875,7 +875,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "b.ne	L_chacha_crypt_bytes_arm64_loop_64_%=\n\t"
         "b	L_chacha_crypt_bytes_arm64_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_64_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_64_%=:\n\t"
         /* Calculate bytes left in block not used */
         "sub	%w[rol8], %w[rol8], %w[len]\n\t"
         /* Store encipher block in over for further operations and left */
@@ -893,7 +893,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "mov	v1.16b, v3.16b\n\t"
         "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_32_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_32_%=:\n\t"
         "cmp	%w[len], #16\n\t"
         "b.lt	L_chacha_crypt_bytes_arm64_lt_16_%=\n\t"
         /* Encipher 16 bytes */
@@ -904,7 +904,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "mov	v0.16b, v1.16b\n\t"
         "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_16_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_16_%=:\n\t"
         "cmp	%w[len], #8\n\t"
         "b.lt	L_chacha_crypt_bytes_arm64_lt_8_%=\n\t"
         /* Encipher 8 bytes */
@@ -915,10 +915,10 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "mov	v0.d[0], v0.d[1]\n\t"
         "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_lt_8_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_lt_8_%=:\n\t"
         "mov	%[rol8], v0.d[0]\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_loop_lt_8_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_loop_lt_8_%=:\n\t"
         /* Encipher 1 byte at a time */
         "ldrb	%w[ctr], [%x[m]], #1\n\t"
         "eor	%w[ctr], %w[ctr], %w[rol8]\n\t"
@@ -927,9 +927,9 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "lsr	%[rol8], %[rol8], #8\n\t"
         "b.gt	L_chacha_crypt_bytes_arm64_loop_lt_8_%=\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_done_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_done_%=:\n\t"
         "\n"
-    "L_chacha_crypt_bytes_arm64_done_all_%=: \n\t"
+    "L_chacha_crypt_bytes_arm64_done_all_%=:\n\t"
         "st1	{v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
         : [ctx] "+r" (ctx), [c] "+r" (c), [len] "+r" (len)
         : [m] "r" (m), [rol8] "r" (rol8), [ctr] "r" (ctr)
@@ -956,7 +956,7 @@ void wc_chacha_setiv(word32* x, const byte* iv, word32 counter)
     );
 }
 
-static const word32 L_chacha_setkey_arm64_constant[] = {
+XALIGNED(8) static const word32 L_chacha_setkey_arm64_constant[] = {
     0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
     0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
 };
@@ -981,7 +981,7 @@ void wc_chacha_setkey(word32* x, const byte* key, word32 keySz)
         "rev32	v1.8h, v1.8h\n\t"
 #endif /* BIG_ENDIAN_ORDER */
         "\n"
-    "L_chacha_setkey_arm64_done_%=: \n\t"
+    "L_chacha_setkey_arm64_done_%=:\n\t"
         "st1	{v1.4s}, [%x[x]]\n\t"
         : [x] "+r" (x), [keySz] "+r" (keySz)
         : [key] "r" (key), [constant] "r" (constant)
@@ -993,7 +993,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
 {
     __asm__ __volatile__ (
         "\n"
-    "L_chacha_use_over_arm64_16byte_loop_%=: \n\t"
+    "L_chacha_use_over_arm64_16byte_loop_%=:\n\t"
         "cmp	%w[len], #16\n\t"
         "b.lt	L_chacha_use_over_arm64_word_loop_%=\n\t"
         /* 16 bytes of state XORed into message. */
@@ -1005,7 +1005,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
         "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
         "b	L_chacha_use_over_arm64_16byte_loop_%=\n\t"
         "\n"
-    "L_chacha_use_over_arm64_word_loop_%=: \n\t"
+    "L_chacha_use_over_arm64_word_loop_%=:\n\t"
         "cmp	%w[len], #4\n\t"
         "b.lt	L_chacha_use_over_arm64_byte_loop_%=\n\t"
         /* 4 bytes of state XORed into message. */
@@ -1017,7 +1017,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
         "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
         "b	L_chacha_use_over_arm64_word_loop_%=\n\t"
         "\n"
-    "L_chacha_use_over_arm64_byte_loop_%=: \n\t"
+    "L_chacha_use_over_arm64_byte_loop_%=:\n\t"
         /* 1 bytes of state XORed into message. */
         "ldrb	w4, [%x[over]], #1\n\t"
         "ldrb	w5, [%x[input]], #1\n\t"
@@ -1027,7 +1027,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
         "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
         "b	L_chacha_use_over_arm64_byte_loop_%=\n\t"
         "\n"
-    "L_chacha_use_over_arm64_done_%=: \n\t"
+    "L_chacha_use_over_arm64_done_%=:\n\t"
         : [over] "+r" (over), [output] "+r" (output), [len] "+r" (len)
         : [input] "r" (input)
         : "memory", "cc", "x4", "x5", "v0", "v1"
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S
index fd7c30f3474..6f88bd5e19f 100644
--- a/wolfcrypt/src/port/arm/armv8-curve25519.S
+++ b/wolfcrypt/src/port/arm/armv8-curve25519.S
@@ -1941,20 +1941,21 @@ L_fe_invert8:
 #if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519)
 #ifndef __APPLE__
 	.text
-	.type	L_curve25519_base_x2, %object
 	.section	.rodata
+	.type	L_curve25519_base_x2, %object
 	.size	L_curve25519_base_x2, 32
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
-	.align	5
+	.align	4
 #else
-	.p2align	5
+	.p2align	4
 #endif /* __APPLE__ */
 L_curve25519_base_x2:
-.xword	0x5cae469cdd684efb, 0x8f3f5ced1e350b5c
-.xword	0xd9750c687d157114, 0x20d342d51873f1b7
+	.quad	0x5cae469cdd684efb,0x8f3f5ced1e350b5c
+	.quad	0xd9750c687d157114,0x20d342d51873f1b7
 #ifndef __APPLE__
 .text
 .globl	curve25519_base
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-curve25519_c.c
index 6867ed3ca75..b8515b00549 100644
--- a/wolfcrypt/src/port/arm/armv8-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/armv8-curve25519_c.c
@@ -501,7 +501,7 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x23, x24, x23\n\t"
         "b	L_fe_invert_nct_num_bits_init_v_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_num_bits_init_v_0_%=: \n\t"
+    "L_fe_invert_nct_num_bits_init_v_0_%=:\n\t"
         "cmp	x8, #0\n\t"
         "b.eq	L_fe_invert_nct_num_bits_init_v_1_%=\n\t"
         "mov	x24, #0xc0\n\t"
@@ -509,7 +509,7 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x23, x24, x23\n\t"
         "b	L_fe_invert_nct_num_bits_init_v_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_num_bits_init_v_1_%=: \n\t"
+    "L_fe_invert_nct_num_bits_init_v_1_%=:\n\t"
         "cmp	x7, #0\n\t"
         "b.eq	L_fe_invert_nct_num_bits_init_v_2_%=\n\t"
         "mov	x24, #0x80\n\t"
@@ -517,16 +517,16 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x23, x24, x23\n\t"
         "b	L_fe_invert_nct_num_bits_init_v_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_num_bits_init_v_2_%=: \n\t"
+    "L_fe_invert_nct_num_bits_init_v_2_%=:\n\t"
         "mov	x24, #0x40\n\t"
         "clz	x23, x6\n\t"
         "sub	x23, x24, x23\n\t"
         "\n"
-    "L_fe_invert_nct_num_bits_init_v_3_%=: \n\t"
+    "L_fe_invert_nct_num_bits_init_v_3_%=:\n\t"
         "tst	x6, #1\n\t"
         "b.ne	L_fe_invert_nct_loop_%=\n\t"
         "\n"
-    "L_fe_invert_nct_even_init_v_0_%=: \n\t"
+    "L_fe_invert_nct_even_init_v_0_%=:\n\t"
         "extr	x6, x7, x6, #1\n\t"
         "extr	x7, x8, x7, #1\n\t"
         "extr	x8, x9, x8, #1\n\t"
@@ -540,7 +540,7 @@ void fe_invert_nct(fe r, const fe a)
         "adcs	x17, x17, x21\n\t"
         "cset	x24, cs\n\t"
         "\n"
-    "L_fe_invert_nct_even_init_v_1_%=: \n\t"
+    "L_fe_invert_nct_even_init_v_1_%=:\n\t"
         "extr	x14, x15, x14, #1\n\t"
         "extr	x15, x16, x15, #1\n\t"
         "extr	x16, x17, x16, #1\n\t"
@@ -548,7 +548,7 @@ void fe_invert_nct(fe r, const fe a)
         "tst	x6, #1\n\t"
         "b.eq	L_fe_invert_nct_even_init_v_0_%=\n\t"
         "\n"
-    "L_fe_invert_nct_loop_%=: \n\t"
+    "L_fe_invert_nct_loop_%=:\n\t"
         "cmp	x22, #1\n\t"
         "b.eq	L_fe_invert_nct_u_done_%=\n\t"
         "cmp	x23, #1\n\t"
@@ -568,7 +568,7 @@ void fe_invert_nct(fe r, const fe a)
         "cmp	x2, x6\n\t"
         "bcc	L_fe_invert_nct_v_larger_%=\n\t"
         "\n"
-    "L_fe_invert_nct_u_larger_%=: \n\t"
+    "L_fe_invert_nct_u_larger_%=:\n\t"
         "subs	x2, x2, x6\n\t"
         "sbcs	x3, x3, x7\n\t"
         "sbcs	x4, x4, x8\n\t"
@@ -583,7 +583,7 @@ void fe_invert_nct(fe r, const fe a)
         "adcs	x12, x12, x20\n\t"
         "adc	x13, x13, x21\n\t"
         "\n"
-    "L_fe_invert_nct_sub_uv_%=: \n\t"
+    "L_fe_invert_nct_sub_uv_%=:\n\t"
         "cmp	x5, #0\n\t"
         "b.eq	L_fe_invert_nct_nct_num_bits_u_0_%=\n\t"
         "mov	x24, #0x100\n\t"
@@ -591,7 +591,7 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x22, x24, x22\n\t"
         "b	L_fe_invert_nct_nct_num_bits_u_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_u_0_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_u_0_%=:\n\t"
         "cmp	x4, #0\n\t"
         "b.eq	L_fe_invert_nct_nct_num_bits_u_1_%=\n\t"
         "mov	x24, #0xc0\n\t"
@@ -599,7 +599,7 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x22, x24, x22\n\t"
         "b	L_fe_invert_nct_nct_num_bits_u_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_u_1_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_u_1_%=:\n\t"
         "cmp	x3, #0\n\t"
         "b.eq	L_fe_invert_nct_nct_num_bits_u_2_%=\n\t"
         "mov	x24, #0x80\n\t"
@@ -607,14 +607,14 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x22, x24, x22\n\t"
         "b	L_fe_invert_nct_nct_num_bits_u_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_u_2_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_u_2_%=:\n\t"
         "mov	x24, #0x40\n\t"
         "clz	x22, x2\n\t"
         "sub	x22, x24, x22\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_u_3_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_u_3_%=:\n\t"
         "\n"
-    "L_fe_invert_nct_even_u_0_%=: \n\t"
+    "L_fe_invert_nct_even_u_0_%=:\n\t"
         "extr	x2, x3, x2, #1\n\t"
         "extr	x3, x4, x3, #1\n\t"
         "extr	x4, x5, x4, #1\n\t"
@@ -628,7 +628,7 @@ void fe_invert_nct(fe r, const fe a)
         "adcs	x13, x13, x21\n\t"
         "cset	x24, cs\n\t"
         "\n"
-    "L_fe_invert_nct_even_u_1_%=: \n\t"
+    "L_fe_invert_nct_even_u_1_%=:\n\t"
         "extr	x10, x11, x10, #1\n\t"
         "extr	x11, x12, x11, #1\n\t"
         "extr	x12, x13, x12, #1\n\t"
@@ -637,7 +637,7 @@ void fe_invert_nct(fe r, const fe a)
         "b.eq	L_fe_invert_nct_even_u_0_%=\n\t"
         "b	L_fe_invert_nct_loop_%=\n\t"
         "\n"
-    "L_fe_invert_nct_v_larger_%=: \n\t"
+    "L_fe_invert_nct_v_larger_%=:\n\t"
         "subs	x6, x6, x2\n\t"
         "sbcs	x7, x7, x3\n\t"
         "sbcs	x8, x8, x4\n\t"
@@ -652,7 +652,7 @@ void fe_invert_nct(fe r, const fe a)
         "adcs	x16, x16, x20\n\t"
         "adc	x17, x17, x21\n\t"
         "\n"
-    "L_fe_invert_nct_sub_vu_%=: \n\t"
+    "L_fe_invert_nct_sub_vu_%=:\n\t"
         "cmp	x9, #0\n\t"
         "b.eq	L_fe_invert_nct_nct_num_bits_v_0_%=\n\t"
         "mov	x24, #0x100\n\t"
@@ -660,7 +660,7 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x23, x24, x23\n\t"
         "b	L_fe_invert_nct_nct_num_bits_v_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_v_0_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_v_0_%=:\n\t"
         "cmp	x8, #0\n\t"
         "b.eq	L_fe_invert_nct_nct_num_bits_v_1_%=\n\t"
         "mov	x24, #0xc0\n\t"
@@ -668,7 +668,7 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x23, x24, x23\n\t"
         "b	L_fe_invert_nct_nct_num_bits_v_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_v_1_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_v_1_%=:\n\t"
         "cmp	x7, #0\n\t"
         "b.eq	L_fe_invert_nct_nct_num_bits_v_2_%=\n\t"
         "mov	x24, #0x80\n\t"
@@ -676,14 +676,14 @@ void fe_invert_nct(fe r, const fe a)
         "sub	x23, x24, x23\n\t"
         "b	L_fe_invert_nct_nct_num_bits_v_3_%=\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_v_2_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_v_2_%=:\n\t"
         "mov	x24, #0x40\n\t"
         "clz	x23, x6\n\t"
         "sub	x23, x24, x23\n\t"
         "\n"
-    "L_fe_invert_nct_nct_num_bits_v_3_%=: \n\t"
+    "L_fe_invert_nct_nct_num_bits_v_3_%=:\n\t"
         "\n"
-    "L_fe_invert_nct_even_v_0_%=: \n\t"
+    "L_fe_invert_nct_even_v_0_%=:\n\t"
         "extr	x6, x7, x6, #1\n\t"
         "extr	x7, x8, x7, #1\n\t"
         "extr	x8, x9, x8, #1\n\t"
@@ -697,7 +697,7 @@ void fe_invert_nct(fe r, const fe a)
         "adcs	x17, x17, x21\n\t"
         "cset	x24, cs\n\t"
         "\n"
-    "L_fe_invert_nct_even_v_1_%=: \n\t"
+    "L_fe_invert_nct_even_v_1_%=:\n\t"
         "extr	x14, x15, x14, #1\n\t"
         "extr	x15, x16, x15, #1\n\t"
         "extr	x16, x17, x16, #1\n\t"
@@ -706,20 +706,20 @@ void fe_invert_nct(fe r, const fe a)
         "b.eq	L_fe_invert_nct_even_v_0_%=\n\t"
         "b	L_fe_invert_nct_loop_%=\n\t"
         "\n"
-    "L_fe_invert_nct_u_done_%=: \n\t"
+    "L_fe_invert_nct_u_done_%=:\n\t"
         "str	x10, [%x[r]]\n\t"
         "str	x11, [%x[r], #8]\n\t"
         "str	x12, [%x[r], #16]\n\t"
         "str	x13, [%x[r], #24]\n\t"
         "b	L_fe_invert_nct_done_%=\n\t"
         "\n"
-    "L_fe_invert_nct_v_done_%=: \n\t"
+    "L_fe_invert_nct_v_done_%=:\n\t"
         "str	x14, [%x[r]]\n\t"
         "str	x15, [%x[r], #8]\n\t"
         "str	x16, [%x[r], #16]\n\t"
         "str	x17, [%x[r], #24]\n\t"
         "\n"
-    "L_fe_invert_nct_done_%=: \n\t"
+    "L_fe_invert_nct_done_%=:\n\t"
         : [r] "+r" (r)
         : [a] "r" (a)
         : "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
@@ -1041,7 +1041,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_invert1_%=: \n\t"
+    "L_fe_invert1_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1142,7 +1142,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_invert2_%=: \n\t"
+    "L_fe_invert2_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1243,7 +1243,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_fe_invert3_%=: \n\t"
+    "L_fe_invert3_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1344,7 +1344,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_fe_invert4_%=: \n\t"
+    "L_fe_invert4_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1443,7 +1443,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_invert5_%=: \n\t"
+    "L_fe_invert5_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1544,7 +1544,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_fe_invert6_%=: \n\t"
+    "L_fe_invert6_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1645,7 +1645,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_fe_invert7_%=: \n\t"
+    "L_fe_invert7_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1744,7 +1744,7 @@ void fe_invert(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_invert8_%=: \n\t"
+    "L_fe_invert8_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -1847,9 +1847,9 @@ void fe_invert(fe r, const fe a)
 }
 
 #if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519)
-static const word64 L_curve25519_base_x2[] = {
-    0x5cae469cdd684efb, 0x8f3f5ced1e350b5c,
-    0xd9750c687d157114, 0x20d342d51873f1b7,
+XALIGNED(16) static const word64 L_curve25519_base_x2[] = {
+    0x5cae469cdd684efbUL, 0x8f3f5ced1e350b5cUL,
+    0xd9750c687d157114UL, 0x20d342d51873f1b7UL,
 };
 
 int curve25519_base(byte* r, const byte* n)
@@ -1876,7 +1876,7 @@ int curve25519_base(byte* r, const byte* n)
         "mov	x23, %x[r]\n\t"
         "mov	x24, #0xfd\n\t"
         "\n"
-    "L_curve25519_base_bits_%=: \n\t"
+    "L_curve25519_base_bits_%=:\n\t"
         "lsr	x3, x24, #6\n\t"
         "and	x4, x24, #63\n\t"
         "ldr	x5, [%x[n], x3, LSL 3]\n\t"
@@ -2885,7 +2885,7 @@ int curve25519_base(byte* r, const byte* n)
         "csel	x17, x13, x9, ne\n\t"
         "csel	x13, x9, x13, ne\n\t"
         "\n"
-    "L_curve25519_base_3_%=: \n\t"
+    "L_curve25519_base_3_%=:\n\t"
         /* Add */
         "adds	x6, x10, x25\n\t"
         "adcs	x7, x11, x26\n\t"
@@ -3427,7 +3427,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_base_inv_1_%=: \n\t"
+    "L_curve25519_base_inv_1_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -3528,7 +3528,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_base_inv_2_%=: \n\t"
+    "L_curve25519_base_inv_2_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -3629,7 +3629,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_base_inv_3_%=: \n\t"
+    "L_curve25519_base_inv_3_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -3730,7 +3730,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_base_inv_4_%=: \n\t"
+    "L_curve25519_base_inv_4_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -3829,7 +3829,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_base_inv_5_%=: \n\t"
+    "L_curve25519_base_inv_5_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -3930,7 +3930,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_base_inv_6_%=: \n\t"
+    "L_curve25519_base_inv_6_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -4031,7 +4031,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_base_inv_7_%=: \n\t"
+    "L_curve25519_base_inv_7_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -4130,7 +4130,7 @@ int curve25519_base(byte* r, const byte* n)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_base_inv_8_%=: \n\t"
+    "L_curve25519_base_inv_8_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -4394,7 +4394,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "stp	xzr, xzr, [x29, #32]\n\t"
         "mov	x24, #0xfe\n\t"
         "\n"
-    "L_curve25519_bits_%=: \n\t"
+    "L_curve25519_bits_%=:\n\t"
         "lsr	x3, x24, #6\n\t"
         "and	x4, x24, #63\n\t"
         "ldr	x5, [%x[n], x3, LSL 3]\n\t"
@@ -5492,7 +5492,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "csel	x17, x13, x9, ne\n\t"
         "csel	x13, x9, x13, ne\n\t"
         "\n"
-    "L_curve25519_3_%=: \n\t"
+    "L_curve25519_3_%=:\n\t"
         /* Add */
         "adds	x6, x10, x25\n\t"
         "adcs	x7, x11, x26\n\t"
@@ -6034,7 +6034,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_inv_1_%=: \n\t"
+    "L_curve25519_inv_1_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6135,7 +6135,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_inv_2_%=: \n\t"
+    "L_curve25519_inv_2_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6236,7 +6236,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_inv_3_%=: \n\t"
+    "L_curve25519_inv_3_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6337,7 +6337,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_inv_4_%=: \n\t"
+    "L_curve25519_inv_4_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6436,7 +6436,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_inv_5_%=: \n\t"
+    "L_curve25519_inv_5_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6537,7 +6537,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_inv_6_%=: \n\t"
+    "L_curve25519_inv_6_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6638,7 +6638,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #112]\n\t"
         "ldp	x8, x9, [x29, #128]\n\t"
         "\n"
-    "L_curve25519_inv_7_%=: \n\t"
+    "L_curve25519_inv_7_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -6737,7 +6737,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
         "ldp	x6, x7, [x29, #80]\n\t"
         "ldp	x8, x9, [x29, #96]\n\t"
         "\n"
-    "L_curve25519_inv_8_%=: \n\t"
+    "L_curve25519_inv_8_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7057,7 +7057,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #16]\n\t"
         "ldp	x8, x9, [x29, #32]\n\t"
         "\n"
-    "L_fe_pow22523_1_%=: \n\t"
+    "L_fe_pow22523_1_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7160,7 +7160,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #16]\n\t"
         "ldp	x8, x9, [x29, #32]\n\t"
         "\n"
-    "L_fe_pow22523_2_%=: \n\t"
+    "L_fe_pow22523_2_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7261,7 +7261,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_pow22523_3_%=: \n\t"
+    "L_fe_pow22523_3_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7362,7 +7362,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_pow22523_4_%=: \n\t"
+    "L_fe_pow22523_4_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7461,7 +7461,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #16]\n\t"
         "ldp	x8, x9, [x29, #32]\n\t"
         "\n"
-    "L_fe_pow22523_5_%=: \n\t"
+    "L_fe_pow22523_5_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7562,7 +7562,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_pow22523_6_%=: \n\t"
+    "L_fe_pow22523_6_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
@@ -7663,7 +7663,7 @@ void fe_pow22523(fe r, const fe a)
         "ldp	x6, x7, [x29, #48]\n\t"
         "ldp	x8, x9, [x29, #64]\n\t"
         "\n"
-    "L_fe_pow22523_7_%=: \n\t"
+    "L_fe_pow22523_7_%=:\n\t"
         /* Square */
         /*  A[0] * A[1] */
         "umulh	x12, x6, x7\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm.S b/wolfcrypt/src/port/arm/armv8-mlkem-asm.S
index 5b7df728433..566e10fcdaf 100644
--- a/wolfcrypt/src/port/arm/armv8-mlkem-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm.S
@@ -31,32 +31,34 @@
 #ifndef WOLFSSL_ARMASM_INLINE
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_consts, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_consts, %object
 	.size	L_mlkem_aarch64_consts, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_consts:
 	.short	0x0d01,0xf301,0x4ebf,0x0549,0x5049,0x0000,0x0000,0x0000
 #ifdef WOLFSSL_WC_MLKEM
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_zetas, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_zetas, %object
 	.size	L_mlkem_aarch64_zetas, 576
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_zetas:
 	.short	0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
@@ -97,16 +99,17 @@ L_mlkem_aarch64_zetas:
 	.short	0x03be,0x03be,0x074d,0x074d,0x05f2,0x05f2,0x065c,0x065c
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_zetas_qinv, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_zetas_qinv, %object
 	.size	L_mlkem_aarch64_zetas_qinv, 576
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_zetas_qinv:
 	.short	0xffed,0x7b0b,0x399a,0x0314,0x34d5,0xcf8e,0x6e1f,0xbeca
@@ -1441,16 +1444,17 @@ _mlkem_ntt:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_zetas_inv, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_zetas_inv, %object
 	.size	L_mlkem_aarch64_zetas_inv, 576
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_zetas_inv:
 	.short	0x06a5,0x06a5,0x070f,0x070f,0x05b4,0x05b4,0x0943,0x0943
@@ -1491,16 +1495,17 @@ L_mlkem_aarch64_zetas_inv:
 	.short	0x0c37,0x0be2,0x0773,0x072c,0x05ed,0x0167,0x02f6,0x05a1
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_zetas_inv_qinv, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_zetas_inv_qinv, %object
 	.size	L_mlkem_aarch64_zetas_inv_qinv, 576
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_zetas_inv_qinv:
 	.short	0xa5a5,0xa5a5,0x440f,0x440f,0xe1b4,0xe1b4,0xa243,0xa243
@@ -5481,16 +5486,17 @@ _mlkem_invntt_sqrdmlsh:
 #endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_zetas_mul, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_zetas_mul, %object
 	.size	L_mlkem_aarch64_zetas_mul, 256
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_zetas_mul:
 	.short	0x08b2,0xf74e,0x01ae,0xfe52,0x022b,0xfdd5,0x034b,0xfcb5
@@ -6955,16 +6961,17 @@ _mlkem_basemul_mont_add:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_aarch64_q, %object
 	.section	.rodata
+	.type	L_mlkem_aarch64_q, %object
 	.size	L_mlkem_aarch64_q, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_aarch64_q:
 	.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
@@ -8160,46 +8167,49 @@ _mlkem_to_mont_sqrdmlsh:
 #endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_to_msg_low, %object
 	.section	.rodata
+	.type	L_mlkem_to_msg_low, %object
 	.size	L_mlkem_to_msg_low, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_to_msg_low:
 	.short	0x0373,0x0373,0x0373,0x0373,0x0373,0x0373,0x0373,0x0373
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_to_msg_high, %object
 	.section	.rodata
+	.type	L_mlkem_to_msg_high, %object
 	.size	L_mlkem_to_msg_high, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_to_msg_high:
 	.short	0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_to_msg_bits, %object
 	.section	.rodata
+	.type	L_mlkem_to_msg_bits, %object
 	.size	L_mlkem_to_msg_bits, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_to_msg_bits:
 	.short	0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080
@@ -8456,31 +8466,33 @@ _mlkem_to_msg_neon:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_from_msg_q1half, %object
 	.section	.rodata
+	.type	L_mlkem_from_msg_q1half, %object
 	.size	L_mlkem_from_msg_q1half, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_from_msg_q1half:
 	.short	0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_from_msg_bits, %object
 	.section	.rodata
+	.type	L_mlkem_from_msg_bits, %object
 	.size	L_mlkem_from_msg_bits, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_from_msg_bits:
 	.byte	0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
@@ -8941,46 +8953,49 @@ L_mlkem_aarch64_cmp_neon_done:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_rej_uniform_mask, %object
 	.section	.rodata
+	.type	L_mlkem_rej_uniform_mask, %object
 	.size	L_mlkem_rej_uniform_mask, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_rej_uniform_mask:
 	.short	0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_rej_uniform_bits, %object
 	.section	.rodata
+	.type	L_mlkem_rej_uniform_bits, %object
 	.size	L_mlkem_rej_uniform_bits, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	2
+	.align	3
 #else
-	.p2align	2
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_rej_uniform_bits:
 	.short	0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080
 #ifndef __APPLE__
 	.text
-	.type	L_mlkem_rej_uniform_indices, %object
 	.section	.rodata
+	.type	L_mlkem_rej_uniform_indices, %object
 	.size	L_mlkem_rej_uniform_indices, 4096
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
-	.align	1
+	.align	3
 #else
-	.p2align	1
+	.p2align	3
 #endif /* __APPLE__ */
 L_mlkem_rej_uniform_indices:
 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
@@ -9688,42 +9703,31 @@ L_mlkem_rej_uniform_done:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_sha3_aarch64_r, %object
 	.section	.rodata
+	.type	L_sha3_aarch64_r, %object
 	.size	L_sha3_aarch64_r, 192
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
-	.align	3
+	.align	4
 #else
-	.p2align	3
+	.p2align	4
 #endif /* __APPLE__ */
 L_sha3_aarch64_r:
-	.xword	0x0000000000000001
-	.xword	0x0000000000008082
-	.xword	0x800000000000808a
-	.xword	0x8000000080008000
-	.xword	0x000000000000808b
-	.xword	0x0000000080000001
-	.xword	0x8000000080008081
-	.xword	0x8000000000008009
-	.xword	0x000000000000008a
-	.xword	0x0000000000000088
-	.xword	0x0000000080008009
-	.xword	0x000000008000000a
-	.xword	0x000000008000808b
-	.xword	0x800000000000008b
-	.xword	0x8000000000008089
-	.xword	0x8000000000008003
-	.xword	0x8000000000008002
-	.xword	0x8000000000000080
-	.xword	0x000000000000800a
-	.xword	0x800000008000000a
-	.xword	0x8000000080008081
-	.xword	0x8000000000008080
-	.xword	0x0000000080000001
-	.xword	0x8000000080008008
+	.quad	0x0000000000000001,0x0000000000008082
+	.quad	0x800000000000808a,0x8000000080008000
+	.quad	0x000000000000808b,0x0000000080000001
+	.quad	0x8000000080008081,0x8000000000008009
+	.quad	0x000000000000008a,0x0000000000000088
+	.quad	0x0000000080008009,0x000000008000000a
+	.quad	0x000000008000808b,0x800000000000008b
+	.quad	0x8000000000008089,0x8000000000008003
+	.quad	0x8000000000008002,0x8000000000000080
+	.quad	0x000000000000800a,0x800000008000000a
+	.quad	0x8000000080008081,0x8000000000008080
+	.quad	0x0000000080000001,0x8000000080008008
 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
 #ifndef __APPLE__
 .text
diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c
index 9e5780815f9..6f7ba392a24 100644
--- a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c
@@ -30,14 +30,14 @@
 #ifdef WOLFSSL_ARMASM
 #ifdef __aarch64__
 #ifdef WOLFSSL_ARMASM_INLINE
-static const word16 L_mlkem_aarch64_consts[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_consts[] = {
     0x0d01, 0xf301, 0x4ebf, 0x0549, 0x5049, 0x0000, 0x0000, 0x0000,
 };
 
 #include <wolfssl/wolfcrypt/wc_mlkem.h>
 
 #ifdef WOLFSSL_WC_MLKEM
-static const word16 L_mlkem_aarch64_zetas[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_zetas[] = {
     0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca,
     0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc,
     0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f,
@@ -76,7 +76,7 @@ static const word16 L_mlkem_aarch64_zetas[] = {
     0x03be, 0x03be, 0x074d, 0x074d, 0x05f2, 0x05f2, 0x065c, 0x065c,
 };
 
-static const word16 L_mlkem_aarch64_zetas_qinv[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_zetas_qinv[] = {
     0xffed, 0x7b0b, 0x399a, 0x0314, 0x34d5, 0xcf8e, 0x6e1f, 0xbeca,
     0xae56, 0x6c6e, 0xf129, 0xc2b6, 0x29c2, 0x054f, 0xd43f, 0x79bc,
     0xe93d, 0x43d4, 0x9908, 0x8e7f, 0x15c4, 0xfbb2, 0x53bf, 0x997f,
@@ -1376,7 +1376,7 @@ void mlkem_ntt(sword16* r)
     );
 }
 
-static const word16 L_mlkem_aarch64_zetas_inv[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_zetas_inv[] = {
     0x06a5, 0x06a5, 0x070f, 0x070f, 0x05b4, 0x05b4, 0x0943, 0x0943,
     0x0922, 0x0922, 0x091d, 0x091d, 0x0134, 0x0134, 0x006c, 0x006c,
     0x0b23, 0x0b23, 0x0366, 0x0366, 0x0356, 0x0356, 0x05e6, 0x05e6,
@@ -1415,7 +1415,7 @@ static const word16 L_mlkem_aarch64_zetas_inv[] = {
     0x0c37, 0x0be2, 0x0773, 0x072c, 0x05ed, 0x0167, 0x02f6, 0x05a1,
 };
 
-static const word16 L_mlkem_aarch64_zetas_inv_qinv[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_zetas_inv_qinv[] = {
     0xa5a5, 0xa5a5, 0x440f, 0x440f, 0xe1b4, 0xe1b4, 0xa243, 0xa243,
     0x4f22, 0x4f22, 0x901d, 0x901d, 0x5d34, 0x5d34, 0x846c, 0x846c,
     0x4423, 0x4423, 0xd566, 0xd566, 0xa556, 0xa556, 0x57e6, 0x57e6,
@@ -5295,7 +5295,7 @@ void mlkem_invntt_sqrdmlsh(sword16* r)
 }
 
 #endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
-static const word16 L_mlkem_aarch64_zetas_mul[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_zetas_mul[] = {
     0x08b2, 0xf74e, 0x01ae, 0xfe52, 0x022b, 0xfdd5, 0x034b, 0xfcb5,
     0x081e, 0xf7e2, 0x0367, 0xfc99, 0x060e, 0xf9f2, 0x0069, 0xff97,
     0x01a6, 0xfe5a, 0x024b, 0xfdb5, 0x00b1, 0xff4f, 0x0c16, 0xf3ea,
@@ -6703,7 +6703,7 @@ void mlkem_basemul_mont_add(sword16* r, const sword16* a, const sword16* b)
     );
 }
 
-static const word16 L_mlkem_aarch64_q[] = {
+XALIGNED(4) static const word16 L_mlkem_aarch64_q[] = {
     0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01,
 };
 
@@ -7761,15 +7761,15 @@ void mlkem_to_mont_sqrdmlsh(sword16* p)
 }
 
 #endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
-static const word16 L_mlkem_to_msg_low[] = {
+XALIGNED(4) static const word16 L_mlkem_to_msg_low[] = {
     0x0373, 0x0373, 0x0373, 0x0373, 0x0373, 0x0373, 0x0373, 0x0373,
 };
 
-static const word16 L_mlkem_to_msg_high[] = {
+XALIGNED(4) static const word16 L_mlkem_to_msg_high[] = {
     0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0,
 };
 
-static const word16 L_mlkem_to_msg_bits[] = {
+XALIGNED(4) static const word16 L_mlkem_to_msg_bits[] = {
     0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
 };
 
@@ -7990,11 +7990,11 @@ void mlkem_to_msg_neon(byte* msg, sword16* p)
     );
 }
 
-static const word16 L_mlkem_from_msg_q1half[] = {
+XALIGNED(4) static const word16 L_mlkem_from_msg_q1half[] = {
     0x0681, 0x0681, 0x0681, 0x0681, 0x0681, 0x0681, 0x0681, 0x0681,
 };
 
-static const word8 L_mlkem_from_msg_bits[] = {
+XALIGNED(4) static const word8 L_mlkem_from_msg_bits[] = {
     0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
     0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 };
@@ -8400,7 +8400,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
         "orr	v8.16b, v8.16b, v0.16b\n\t"
         "orr	v9.16b, v9.16b, v1.16b\n\t"
         "\n"
-    "L_mlkem_aarch64_cmp_neon_done_%=: \n\t"
+    "L_mlkem_aarch64_cmp_neon_done_%=:\n\t"
         "orr	v8.16b, v8.16b, v9.16b\n\t"
         "orr	v10.16b, v10.16b, v11.16b\n\t"
         "orr	v8.16b, v8.16b, v10.16b\n\t"
@@ -8417,15 +8417,15 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
     return (word32)(size_t)a;
 }
 
-static const word16 L_mlkem_rej_uniform_mask[] = {
+XALIGNED(4) static const word16 L_mlkem_rej_uniform_mask[] = {
     0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff,
 };
 
-static const word16 L_mlkem_rej_uniform_bits[] = {
+XALIGNED(4) static const word16 L_mlkem_rej_uniform_bits[] = {
     0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
 };
 
-static const word8 L_mlkem_rej_uniform_indices[] = {
+XALIGNED(4) static const word8 L_mlkem_rej_uniform_indices[] = {
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0x00, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -8963,7 +8963,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
         "subs	wzr, %w[len], #16\n\t"
         "b.lt	L_mlkem_rej_uniform_loop_4_%=\n\t"
         "\n"
-    "L_mlkem_rej_uniform_loop_16_%=: \n\t"
+    "L_mlkem_rej_uniform_loop_16_%=:\n\t"
         "ld3	{v4.8b, v5.8b, v6.8b}, [%x[r]], #24\n\t"
         "zip1	v4.16b, v4.16b, v1.16b\n\t"
         "zip1	v5.16b, v5.16b, v1.16b\n\t"
@@ -9010,7 +9010,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
         "b.lt	L_mlkem_rej_uniform_loop_4_%=\n\t"
         "b	L_mlkem_rej_uniform_loop_16_%=\n\t"
         "\n"
-    "L_mlkem_rej_uniform_loop_4_%=: \n\t"
+    "L_mlkem_rej_uniform_loop_4_%=:\n\t"
         "subs	w10, %w[len], w12\n\t"
         "b.eq	L_mlkem_rej_uniform_done_%=\n\t"
         "subs	x10, x10, #4\n\t"
@@ -9047,7 +9047,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
         "b.eq	L_mlkem_rej_uniform_done_%=\n\t"
         "b	L_mlkem_rej_uniform_loop_4_%=\n\t"
         "\n"
-    "L_mlkem_rej_uniform_loop_lt_4_%=: \n\t"
+    "L_mlkem_rej_uniform_loop_lt_4_%=:\n\t"
         "ldr	%[mask], [%x[r]], #6\n\t"
         "lsr	%[q], %[mask], #12\n\t"
         "lsr	%[bits], %[mask], #24\n\t"
@@ -9088,7 +9088,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
         "b.eq	L_mlkem_rej_uniform_done_%=\n\t"
         "b	L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
         "\n"
-    "L_mlkem_rej_uniform_done_%=: \n\t"
+    "L_mlkem_rej_uniform_done_%=:\n\t"
         "mov	x0, x12\n\t"
         : [p] "+r" (p), [len] "+r" (len), [rLen] "+r" (rLen)
         : [r] "r" (r), [mask] "r" (mask), [q] "r" (q), [bits] "r" (bits),
@@ -9100,19 +9100,19 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
     return (word32)(size_t)p;
 }
 
-static const word64 L_sha3_aarch64_r[] = {
-    0x0000000000000001, 0x0000000000008082,
-    0x800000000000808a, 0x8000000080008000,
-    0x000000000000808b, 0x0000000080000001,
-    0x8000000080008081, 0x8000000000008009,
-    0x000000000000008a, 0x0000000000000088,
-    0x0000000080008009, 0x000000008000000a,
-    0x000000008000808b, 0x800000000000008b,
-    0x8000000000008089, 0x8000000000008003,
-    0x8000000000008002, 0x8000000000000080,
-    0x000000000000800a, 0x800000008000000a,
-    0x8000000080008081, 0x8000000000008080,
-    0x0000000080000001, 0x8000000080008008,
+XALIGNED(16) static const word64 L_sha3_aarch64_r[] = {
+    0x0000000000000001UL, 0x0000000000008082UL,
+    0x800000000000808aUL, 0x8000000080008000UL,
+    0x000000000000808bUL, 0x0000000080000001UL,
+    0x8000000080008081UL, 0x8000000000008009UL,
+    0x000000000000008aUL, 0x0000000000000088UL,
+    0x0000000080008009UL, 0x000000008000000aUL,
+    0x000000008000808bUL, 0x800000000000008bUL,
+    0x8000000000008089UL, 0x8000000000008003UL,
+    0x8000000000008002UL, 0x8000000000000080UL,
+    0x000000000000800aUL, 0x800000008000000aUL,
+    0x8000000080008081UL, 0x8000000000008080UL,
+    0x0000000080000001UL, 0x8000000080008008UL,
 };
 
 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
@@ -9155,7 +9155,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
         "mov	x28, #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_transform_blocksx3_neon_begin_%=: \n\t"
+    "L_SHA3_transform_blocksx3_neon_begin_%=:\n\t"
         "stp	%[r], x28, [x29, #48]\n\t"
         /* Col Mix */
         "eor3	v31.16b, v0.16b, v5.16b, v10.16b\n\t"
@@ -9476,7 +9476,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
         "mov	%x[seed], #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t"
+    "L_SHA3_shake128_blocksx3_seed_neon_begin_%=:\n\t"
         "stp	%[r], %x[seed], [x29, #48]\n\t"
         /* Col Mix */
         "eor3	v31.16b, v0.16b, v5.16b, v10.16b\n\t"
@@ -9797,7 +9797,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
         "mov	%x[seed], #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t"
+    "L_SHA3_shake256_blocksx3_seed_neon_begin_%=:\n\t"
         "stp	%[r], %x[seed], [x29, #48]\n\t"
         /* Col Mix */
         "eor3	v31.16b, v0.16b, v5.16b, v10.16b\n\t"
@@ -10097,7 +10097,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
         "mov	x28, #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_transform_blocksx3_neon_begin_%=: \n\t"
+    "L_SHA3_transform_blocksx3_neon_begin_%=:\n\t"
         "stp	%[r], x28, [x29, #48]\n\t"
         /* Col Mix NEON */
         "eor	v30.16b, v4.16b, v9.16b\n\t"
@@ -10503,7 +10503,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
         "mov	%x[seed], #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t"
+    "L_SHA3_shake128_blocksx3_seed_neon_begin_%=:\n\t"
         "stp	%[r], %x[seed], [x29, #48]\n\t"
         /* Col Mix NEON */
         "eor	v30.16b, v4.16b, v9.16b\n\t"
@@ -10909,7 +10909,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
         "mov	%x[seed], #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t"
+    "L_SHA3_shake256_blocksx3_seed_neon_begin_%=:\n\t"
         "stp	%[r], %x[seed], [x29, #48]\n\t"
         /* Col Mix NEON */
         "eor	v30.16b, v4.16b, v9.16b\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-poly1305-asm.S b/wolfcrypt/src/port/arm/armv8-poly1305-asm.S
index ef7b908e096..00897f153bb 100644
--- a/wolfcrypt/src/port/arm/armv8-poly1305-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-poly1305-asm.S
@@ -438,22 +438,20 @@ L_poly1305_arm64_blocks_done_all:
 #endif /* __APPLE__ */
 #ifndef __APPLE__
 	.text
-	.type	L_poly1305_set_key_arm64_clamp, %object
 	.section	.rodata
+	.type	L_poly1305_set_key_arm64_clamp, %object
 	.size	L_poly1305_set_key_arm64_clamp, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_poly1305_set_key_arm64_clamp:
-	.word	0x0fffffff
-	.word	0x0ffffffc
-	.word	0x0ffffffc
-	.word	0x0ffffffc
+	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc
 #ifndef __APPLE__
 .text
 .globl	poly1305_set_key
diff --git a/wolfcrypt/src/port/arm/armv8-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-poly1305-asm_c.c
index 5ba43381043..26429dcd3ee 100644
--- a/wolfcrypt/src/port/arm/armv8-poly1305-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-poly1305-asm_c.c
@@ -138,7 +138,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "mul	v22.4s, v18.4s, v24.4s\n\t"
         "mul	v23.4s, v19.4s, v24.4s\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_loop_64_%=: \n\t"
+    "L_poly1305_arm64_blocks_loop_64_%=:\n\t"
         /* Load message of 64 bytes - setting hi bit for not finished */
         "ld4	{v5.4s, v6.4s, v7.4s, v8.4s}, [%x[m]], #0x40\n\t"
         "sub	%x[bytes], %x[bytes], #0x40\n\t"
@@ -314,7 +314,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "usra	v4.2d, v3.2d, #26\n\t"
         "and	v3.16b, v3.16b, v26.16b\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_done_32_%=: \n\t"
+    "L_poly1305_arm64_blocks_done_32_%=:\n\t"
         "cmp	%x[bytes], #16\n\t"
         "b.eq	L_poly1305_arm64_blocks_transfer_%=\n\t"
         "add	x14, %x[ctx], #0x60\n\t"
@@ -322,7 +322,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "st1	{v4.s}[0], [x14]\n\t"
         "b	L_poly1305_arm64_blocks_done_all_%=\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_transfer_%=: \n\t"
+    "L_poly1305_arm64_blocks_transfer_%=:\n\t"
         "mov	w3, v0.s[0]\n\t"
         "mov	w4, v1.s[0]\n\t"
         "mov	w5, v2.s[0]\n\t"
@@ -330,7 +330,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "mov	w7, v4.s[0]\n\t"
         "b	L_poly1305_arm64_blocks_start_%=\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_done_%=: \n\t"
+    "L_poly1305_arm64_blocks_done_%=:\n\t"
         "cmp	%x[bytes], #16\n\t"
         "b.lt	L_poly1305_arm64_blocks_done_all_%=\n\t"
         /* Load h */
@@ -338,7 +338,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "ldp	w5, w6, [%x[ctx], #104]\n\t"
         "ldr	w7, [%x[ctx], #112]\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_start_%=: \n\t"
+    "L_poly1305_arm64_blocks_start_%=:\n\t"
         "mov	x17, #1\n\t"
         /* Load r */
         "ldp	x8, x9, [%x[ctx]]\n\t"
@@ -350,7 +350,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "lsr	x5, x7, #24\n\t"
         "add	x4, x4, x7, lsl 40\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_loop_%=: \n\t"
+    "L_poly1305_arm64_blocks_loop_%=:\n\t"
         /* Load m */
         "ldr	x14, [%x[m]]\n\t"
         "ldr	x15, [%x[m], #8]\n\t"
@@ -413,7 +413,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
         "stp	w5, w6, [%x[ctx], #104]\n\t"
         "str	w7, [%x[ctx], #112]\n\t"
         "\n"
-    "L_poly1305_arm64_blocks_done_all_%=: \n\t"
+    "L_poly1305_arm64_blocks_done_all_%=:\n\t"
         : [ctx] "+r" (ctx), [bytes] "+r" (bytes)
         : [m] "r" (m)
         : "memory", "cc", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
@@ -424,7 +424,7 @@ void poly1305_arm64_blocks(Poly1305* ctx, const unsigned char* m, size_t bytes)
     );
 }
 
-static const word32 L_poly1305_set_key_arm64_clamp[] = {
+XALIGNED(8) static const word32 L_poly1305_set_key_arm64_clamp[] = {
     0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
 };
 
diff --git a/wolfcrypt/src/port/arm/armv8-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-sha256-asm.S
index b764a3fef60..669e28021db 100644
--- a/wolfcrypt/src/port/arm/armv8-sha256-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-sha256-asm.S
@@ -32,82 +32,35 @@
 #if  !defined(NO_SHA256) || defined(WOLFSSL_SHA224)
 #ifndef __APPLE__
 	.text
-	.type	L_SHA256_transform_neon_len_k, %object
 	.section	.rodata
+	.type	L_SHA256_transform_neon_len_k, %object
 	.size	L_SHA256_transform_neon_len_k, 256
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_SHA256_transform_neon_len_k:
-	.word	0x428a2f98
-	.word	0x71374491
-	.word	0xb5c0fbcf
-	.word	0xe9b5dba5
-	.word	0x3956c25b
-	.word	0x59f111f1
-	.word	0x923f82a4
-	.word	0xab1c5ed5
-	.word	0xd807aa98
-	.word	0x12835b01
-	.word	0x243185be
-	.word	0x550c7dc3
-	.word	0x72be5d74
-	.word	0x80deb1fe
-	.word	0x9bdc06a7
-	.word	0xc19bf174
-	.word	0xe49b69c1
-	.word	0xefbe4786
-	.word	0x0fc19dc6
-	.word	0x240ca1cc
-	.word	0x2de92c6f
-	.word	0x4a7484aa
-	.word	0x5cb0a9dc
-	.word	0x76f988da
-	.word	0x983e5152
-	.word	0xa831c66d
-	.word	0xb00327c8
-	.word	0xbf597fc7
-	.word	0xc6e00bf3
-	.word	0xd5a79147
-	.word	0x06ca6351
-	.word	0x14292967
-	.word	0x27b70a85
-	.word	0x2e1b2138
-	.word	0x4d2c6dfc
-	.word	0x53380d13
-	.word	0x650a7354
-	.word	0x766a0abb
-	.word	0x81c2c92e
-	.word	0x92722c85
-	.word	0xa2bfe8a1
-	.word	0xa81a664b
-	.word	0xc24b8b70
-	.word	0xc76c51a3
-	.word	0xd192e819
-	.word	0xd6990624
-	.word	0xf40e3585
-	.word	0x106aa070
-	.word	0x19a4c116
-	.word	0x1e376c08
-	.word	0x2748774c
-	.word	0x34b0bcb5
-	.word	0x391c0cb3
-	.word	0x4ed8aa4a
-	.word	0x5b9cca4f
-	.word	0x682e6ff3
-	.word	0x748f82ee
-	.word	0x78a5636f
-	.word	0x84c87814
-	.word	0x8cc70208
-	.word	0x90befffa
-	.word	0xa4506ceb
-	.word	0xbef9a3f7
-	.word	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_Len_neon
@@ -1101,82 +1054,35 @@ L_sha256_len_neon_start:
 #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
 #ifndef __APPLE__
 	.text
-	.type	L_SHA256_trans_crypto_len_k, %object
 	.section	.rodata
+	.type	L_SHA256_trans_crypto_len_k, %object
 	.size	L_SHA256_trans_crypto_len_k, 256
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
 #ifndef __APPLE__
 	.align	3
 #else
 	.p2align	3
 #endif /* __APPLE__ */
 L_SHA256_trans_crypto_len_k:
-	.word	0x428a2f98
-	.word	0x71374491
-	.word	0xb5c0fbcf
-	.word	0xe9b5dba5
-	.word	0x3956c25b
-	.word	0x59f111f1
-	.word	0x923f82a4
-	.word	0xab1c5ed5
-	.word	0xd807aa98
-	.word	0x12835b01
-	.word	0x243185be
-	.word	0x550c7dc3
-	.word	0x72be5d74
-	.word	0x80deb1fe
-	.word	0x9bdc06a7
-	.word	0xc19bf174
-	.word	0xe49b69c1
-	.word	0xefbe4786
-	.word	0x0fc19dc6
-	.word	0x240ca1cc
-	.word	0x2de92c6f
-	.word	0x4a7484aa
-	.word	0x5cb0a9dc
-	.word	0x76f988da
-	.word	0x983e5152
-	.word	0xa831c66d
-	.word	0xb00327c8
-	.word	0xbf597fc7
-	.word	0xc6e00bf3
-	.word	0xd5a79147
-	.word	0x06ca6351
-	.word	0x14292967
-	.word	0x27b70a85
-	.word	0x2e1b2138
-	.word	0x4d2c6dfc
-	.word	0x53380d13
-	.word	0x650a7354
-	.word	0x766a0abb
-	.word	0x81c2c92e
-	.word	0x92722c85
-	.word	0xa2bfe8a1
-	.word	0xa81a664b
-	.word	0xc24b8b70
-	.word	0xc76c51a3
-	.word	0xd192e819
-	.word	0xd6990624
-	.word	0xf40e3585
-	.word	0x106aa070
-	.word	0x19a4c116
-	.word	0x1e376c08
-	.word	0x2748774c
-	.word	0x34b0bcb5
-	.word	0x391c0cb3
-	.word	0x4ed8aa4a
-	.word	0x5b9cca4f
-	.word	0x682e6ff3
-	.word	0x748f82ee
-	.word	0x78a5636f
-	.word	0x84c87814
-	.word	0x8cc70208
-	.word	0x90befffa
-	.word	0xa4506ceb
-	.word	0xbef9a3f7
-	.word	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_Len_crypto
diff --git a/wolfcrypt/src/port/arm/armv8-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha256-asm_c.c
index 671badfc146..5e5c05aeb96 100644
--- a/wolfcrypt/src/port/arm/armv8-sha256-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-sha256-asm_c.c
@@ -33,7 +33,7 @@
 #include <wolfssl/wolfcrypt/sha256.h>
 
 #if  !defined(NO_SHA256) || defined(WOLFSSL_SHA224)
-static const word32 L_SHA256_transform_neon_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_neon_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -67,7 +67,7 @@ void Transform_Sha256_Len_neon(wc_Sha256* sha256, const byte* data, word32 len)
         "ldr	w11, [%x[sha256], #28]\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_sha256_len_neon_begin_%=: \n\t"
+    "L_sha256_len_neon_begin_%=:\n\t"
         /* Load W */
         /* Copy digest to add in at end */
         "ld1	{v0.8b, v1.8b, v2.8b, v3.8b}, [%x[data]], #32\n\t"
@@ -91,7 +91,7 @@ void Transform_Sha256_Len_neon(wc_Sha256* sha256, const byte* data, word32 len)
         "mov	x24, #3\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_sha256_len_neon_start_%=: \n\t"
+    "L_sha256_len_neon_start_%=:\n\t"
         /* Round 0 */
         "mov	w14, v0.s[0]\n\t"
         "ror	w12, w8, #6\n\t"
@@ -1020,7 +1020,7 @@ void Transform_Sha256_Len_neon(wc_Sha256* sha256, const byte* data, word32 len)
 }
 
 #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
-static const word32 L_SHA256_trans_crypto_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_trans_crypto_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -1055,7 +1055,7 @@ void Transform_Sha256_Len_crypto(wc_Sha256* sha256, const byte* data,
         "ld1	{v0.4s, v1.4s}, [%x[sha256]]\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_sha256_len_crypto_begin_%=: \n\t"
+    "L_sha256_len_crypto_begin_%=:\n\t"
         /* Load W */
         "ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [%x[data]], #0x40\n\t"
         "rev32	v4.16b, v4.16b\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-sha3-asm.S
index fb2c3dd79b0..631432169ba 100644
--- a/wolfcrypt/src/port/arm/armv8-sha3-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-sha3-asm.S
@@ -33,42 +33,31 @@
 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
 #ifndef __APPLE__
 	.text
-	.type	L_SHA3_transform_crypto_r, %object
 	.section	.rodata
+	.type	L_SHA3_transform_crypto_r, %object
 	.size	L_SHA3_transform_crypto_r, 192
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
-	.align	3
+	.align	4
 #else
-	.p2align	3
+	.p2align	4
 #endif /* __APPLE__ */
 L_SHA3_transform_crypto_r:
-	.xword	0x0000000000000001
-	.xword	0x0000000000008082
-	.xword	0x800000000000808a
-	.xword	0x8000000080008000
-	.xword	0x000000000000808b
-	.xword	0x0000000080000001
-	.xword	0x8000000080008081
-	.xword	0x8000000000008009
-	.xword	0x000000000000008a
-	.xword	0x0000000000000088
-	.xword	0x0000000080008009
-	.xword	0x000000008000000a
-	.xword	0x000000008000808b
-	.xword	0x800000000000008b
-	.xword	0x8000000000008089
-	.xword	0x8000000000008003
-	.xword	0x8000000000008002
-	.xword	0x8000000000000080
-	.xword	0x000000000000800a
-	.xword	0x800000008000000a
-	.xword	0x8000000080008081
-	.xword	0x8000000000008080
-	.xword	0x0000000080000001
-	.xword	0x8000000080008008
+	.quad	0x0000000000000001,0x0000000000008082
+	.quad	0x800000000000808a,0x8000000080008000
+	.quad	0x000000000000808b,0x0000000080000001
+	.quad	0x8000000080008081,0x8000000000008009
+	.quad	0x000000000000008a,0x0000000000000088
+	.quad	0x0000000080008009,0x000000008000000a
+	.quad	0x000000008000808b,0x800000000000008b
+	.quad	0x8000000000008089,0x8000000000008003
+	.quad	0x8000000000008002,0x8000000000000080
+	.quad	0x000000000000800a,0x800000008000000a
+	.quad	0x8000000080008081,0x8000000000008080
+	.quad	0x0000000080000001,0x8000000080008008
 #ifndef __APPLE__
 .text
 .globl	BlockSha3_crypto
@@ -207,42 +196,31 @@ L_sha3_crypto_begin:
 #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
 #ifndef __APPLE__
 	.text
-	.type	L_SHA3_transform_base_r, %object
 	.section	.rodata
+	.type	L_SHA3_transform_base_r, %object
 	.size	L_SHA3_transform_base_r, 192
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
-	.align	3
+	.align	4
 #else
-	.p2align	3
+	.p2align	4
 #endif /* __APPLE__ */
 L_SHA3_transform_base_r:
-	.xword	0x0000000000000001
-	.xword	0x0000000000008082
-	.xword	0x800000000000808a
-	.xword	0x8000000080008000
-	.xword	0x000000000000808b
-	.xword	0x0000000080000001
-	.xword	0x8000000080008081
-	.xword	0x8000000000008009
-	.xword	0x000000000000008a
-	.xword	0x0000000000000088
-	.xword	0x0000000080008009
-	.xword	0x000000008000000a
-	.xword	0x000000008000808b
-	.xword	0x800000000000008b
-	.xword	0x8000000000008089
-	.xword	0x8000000000008003
-	.xword	0x8000000000008002
-	.xword	0x8000000000000080
-	.xword	0x000000000000800a
-	.xword	0x800000008000000a
-	.xword	0x8000000080008081
-	.xword	0x8000000000008080
-	.xword	0x0000000080000001
-	.xword	0x8000000080008008
+	.quad	0x0000000000000001,0x0000000000008082
+	.quad	0x800000000000808a,0x8000000080008000
+	.quad	0x000000000000808b,0x0000000080000001
+	.quad	0x8000000080008081,0x8000000000008009
+	.quad	0x000000000000008a,0x0000000000000088
+	.quad	0x0000000080008009,0x000000008000000a
+	.quad	0x000000008000808b,0x800000000000008b
+	.quad	0x8000000000008089,0x8000000000008003
+	.quad	0x8000000000008002,0x8000000000000080
+	.quad	0x000000000000800a,0x800000008000000a
+	.quad	0x8000000080008081,0x8000000000008080
+	.quad	0x0000000080000001,0x8000000080008008
 #ifndef __APPLE__
 .text
 .globl	BlockSha3_base
diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
index 253053f29d5..341cddee957 100644
--- a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
@@ -34,19 +34,19 @@
 
 #ifdef WOLFSSL_SHA3
 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
-static const word64 L_SHA3_transform_crypto_r[] = {
-    0x0000000000000001, 0x0000000000008082,
-    0x800000000000808a, 0x8000000080008000,
-    0x000000000000808b, 0x0000000080000001,
-    0x8000000080008081, 0x8000000000008009,
-    0x000000000000008a, 0x0000000000000088,
-    0x0000000080008009, 0x000000008000000a,
-    0x000000008000808b, 0x800000000000008b,
-    0x8000000000008089, 0x8000000000008003,
-    0x8000000000008002, 0x8000000000000080,
-    0x000000000000800a, 0x800000008000000a,
-    0x8000000080008081, 0x8000000000008080,
-    0x0000000080000001, 0x8000000080008008,
+XALIGNED(16) static const word64 L_SHA3_transform_crypto_r[] = {
+    0x0000000000000001UL, 0x0000000000008082UL,
+    0x800000000000808aUL, 0x8000000080008000UL,
+    0x000000000000808bUL, 0x0000000080000001UL,
+    0x8000000080008081UL, 0x8000000000008009UL,
+    0x000000000000008aUL, 0x0000000000000088UL,
+    0x0000000080008009UL, 0x000000008000000aUL,
+    0x000000008000808bUL, 0x800000000000008bUL,
+    0x8000000000008089UL, 0x8000000000008003UL,
+    0x8000000000008002UL, 0x8000000000000080UL,
+    0x000000000000800aUL, 0x800000008000000aUL,
+    0x8000000080008081UL, 0x8000000000008080UL,
+    0x0000000080000001UL, 0x8000000080008008UL,
 };
 
 void BlockSha3_crypto(word64* state)
@@ -67,7 +67,7 @@ void BlockSha3_crypto(word64* state)
         "mov	x2, #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_sha3_crypto_begin_%=: \n\t"
+    "L_sha3_crypto_begin_%=:\n\t"
         /* Col Mix */
         "eor3	v31.16b, v0.16b, v5.16b, v10.16b\n\t"
         "eor3	v27.16b, v1.16b, v6.16b, v11.16b\n\t"
@@ -165,19 +165,19 @@ void BlockSha3_crypto(word64* state)
 }
 
 #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
-static const word64 L_SHA3_transform_base_r[] = {
-    0x0000000000000001, 0x0000000000008082,
-    0x800000000000808a, 0x8000000080008000,
-    0x000000000000808b, 0x0000000080000001,
-    0x8000000080008081, 0x8000000000008009,
-    0x000000000000008a, 0x0000000000000088,
-    0x0000000080008009, 0x000000008000000a,
-    0x000000008000808b, 0x800000000000008b,
-    0x8000000000008089, 0x8000000000008003,
-    0x8000000000008002, 0x8000000000000080,
-    0x000000000000800a, 0x800000008000000a,
-    0x8000000080008081, 0x8000000000008080,
-    0x0000000080000001, 0x8000000080008008,
+XALIGNED(16) static const word64 L_SHA3_transform_base_r[] = {
+    0x0000000000000001UL, 0x0000000000008082UL,
+    0x800000000000808aUL, 0x8000000080008000UL,
+    0x000000000000808bUL, 0x0000000080000001UL,
+    0x8000000080008081UL, 0x8000000000008009UL,
+    0x000000000000008aUL, 0x0000000000000088UL,
+    0x0000000080008009UL, 0x000000008000000aUL,
+    0x000000008000808bUL, 0x800000000000008bUL,
+    0x8000000000008089UL, 0x8000000000008003UL,
+    0x8000000000008002UL, 0x8000000000000080UL,
+    0x000000000000800aUL, 0x800000008000000aUL,
+    0x8000000080008081UL, 0x8000000000008080UL,
+    0x0000000080000001UL, 0x8000000080008008UL,
 };
 
 void BlockSha3_base(word64* state)
@@ -203,7 +203,7 @@ void BlockSha3_base(word64* state)
         "mov	x28, #24\n\t"
         /* Start of 24 rounds */
         "\n"
-    "L_SHA3_transform_base_begin_%=: \n\t"
+    "L_SHA3_transform_base_begin_%=:\n\t"
         "stp	%[r], x28, [x29, #48]\n\t"
         "eor	%x[state], x5, x10\n\t"
         "eor	x30, x1, x6\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S
index 92cebfda8d0..1a48cf73177 100644
--- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S
@@ -32,113 +32,75 @@
 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
 #ifndef __APPLE__
 	.text
-	.type	L_SHA512_transform_neon_len_k, %object
 	.section	.rodata
+	.type	L_SHA512_transform_neon_len_k, %object
 	.size	L_SHA512_transform_neon_len_k, 640
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
-	.align	3
+	.align	4
 #else
-	.p2align	3
+	.p2align	4
 #endif /* __APPLE__ */
 L_SHA512_transform_neon_len_k:
-	.xword	0x428a2f98d728ae22
-	.xword	0x7137449123ef65cd
-	.xword	0xb5c0fbcfec4d3b2f
-	.xword	0xe9b5dba58189dbbc
-	.xword	0x3956c25bf348b538
-	.xword	0x59f111f1b605d019
-	.xword	0x923f82a4af194f9b
-	.xword	0xab1c5ed5da6d8118
-	.xword	0xd807aa98a3030242
-	.xword	0x12835b0145706fbe
-	.xword	0x243185be4ee4b28c
-	.xword	0x550c7dc3d5ffb4e2
-	.xword	0x72be5d74f27b896f
-	.xword	0x80deb1fe3b1696b1
-	.xword	0x9bdc06a725c71235
-	.xword	0xc19bf174cf692694
-	.xword	0xe49b69c19ef14ad2
-	.xword	0xefbe4786384f25e3
-	.xword	0x0fc19dc68b8cd5b5
-	.xword	0x240ca1cc77ac9c65
-	.xword	0x2de92c6f592b0275
-	.xword	0x4a7484aa6ea6e483
-	.xword	0x5cb0a9dcbd41fbd4
-	.xword	0x76f988da831153b5
-	.xword	0x983e5152ee66dfab
-	.xword	0xa831c66d2db43210
-	.xword	0xb00327c898fb213f
-	.xword	0xbf597fc7beef0ee4
-	.xword	0xc6e00bf33da88fc2
-	.xword	0xd5a79147930aa725
-	.xword	0x06ca6351e003826f
-	.xword	0x142929670a0e6e70
-	.xword	0x27b70a8546d22ffc
-	.xword	0x2e1b21385c26c926
-	.xword	0x4d2c6dfc5ac42aed
-	.xword	0x53380d139d95b3df
-	.xword	0x650a73548baf63de
-	.xword	0x766a0abb3c77b2a8
-	.xword	0x81c2c92e47edaee6
-	.xword	0x92722c851482353b
-	.xword	0xa2bfe8a14cf10364
-	.xword	0xa81a664bbc423001
-	.xword	0xc24b8b70d0f89791
-	.xword	0xc76c51a30654be30
-	.xword	0xd192e819d6ef5218
-	.xword	0xd69906245565a910
-	.xword	0xf40e35855771202a
-	.xword	0x106aa07032bbd1b8
-	.xword	0x19a4c116b8d2d0c8
-	.xword	0x1e376c085141ab53
-	.xword	0x2748774cdf8eeb99
-	.xword	0x34b0bcb5e19b48a8
-	.xword	0x391c0cb3c5c95a63
-	.xword	0x4ed8aa4ae3418acb
-	.xword	0x5b9cca4f7763e373
-	.xword	0x682e6ff3d6b2b8a3
-	.xword	0x748f82ee5defb2fc
-	.xword	0x78a5636f43172f60
-	.xword	0x84c87814a1f0ab72
-	.xword	0x8cc702081a6439ec
-	.xword	0x90befffa23631e28
-	.xword	0xa4506cebde82bde9
-	.xword	0xbef9a3f7b2c67915
-	.xword	0xc67178f2e372532b
-	.xword	0xca273eceea26619c
-	.xword	0xd186b8c721c0c207
-	.xword	0xeada7dd6cde0eb1e
-	.xword	0xf57d4f7fee6ed178
-	.xword	0x06f067aa72176fba
-	.xword	0x0a637dc5a2c898a6
-	.xword	0x113f9804bef90dae
-	.xword	0x1b710b35131c471b
-	.xword	0x28db77f523047d84
-	.xword	0x32caab7b40c72493
-	.xword	0x3c9ebe0a15c9bebc
-	.xword	0x431d67c49c100d4c
-	.xword	0x4cc5d4becb3e42b6
-	.xword	0x597f299cfc657e2a
-	.xword	0x5fcb6fab3ad6faec
-	.xword	0x6c44198c4a475817
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 #ifndef __APPLE__
 	.text
-	.type	L_SHA512_transform_neon_len_r8, %object
 	.section	.rodata
+	.type	L_SHA512_transform_neon_len_r8, %object
 	.size	L_SHA512_transform_neon_len_r8, 16
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
 	.align	4
 #else
 	.p2align	4
 #endif /* __APPLE__ */
 L_SHA512_transform_neon_len_r8:
-	.xword	0x7060504030201, 0x80f0e0d0c0b0a09
+	.quad	0x0007060504030201,0x080f0e0d0c0b0a09
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha512_Len_neon
@@ -1087,98 +1049,59 @@ L_sha512_len_neon_start:
 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
 #ifndef __APPLE__
 	.text
-	.type	L_SHA512_trans_crypto_len_k, %object
 	.section	.rodata
+	.type	L_SHA512_trans_crypto_len_k, %object
 	.size	L_SHA512_trans_crypto_len_k, 640
 #else
 	.section	__DATA,__data
 #endif /* __APPLE__ */
+	# 16-byte aligned, 128-bit aligned
 #ifndef __APPLE__
-	.align	3
+	.align	4
 #else
-	.p2align	3
+	.p2align	4
 #endif /* __APPLE__ */
 L_SHA512_trans_crypto_len_k:
-	.xword	0x428a2f98d728ae22
-	.xword	0x7137449123ef65cd
-	.xword	0xb5c0fbcfec4d3b2f
-	.xword	0xe9b5dba58189dbbc
-	.xword	0x3956c25bf348b538
-	.xword	0x59f111f1b605d019
-	.xword	0x923f82a4af194f9b
-	.xword	0xab1c5ed5da6d8118
-	.xword	0xd807aa98a3030242
-	.xword	0x12835b0145706fbe
-	.xword	0x243185be4ee4b28c
-	.xword	0x550c7dc3d5ffb4e2
-	.xword	0x72be5d74f27b896f
-	.xword	0x80deb1fe3b1696b1
-	.xword	0x9bdc06a725c71235
-	.xword	0xc19bf174cf692694
-	.xword	0xe49b69c19ef14ad2
-	.xword	0xefbe4786384f25e3
-	.xword	0x0fc19dc68b8cd5b5
-	.xword	0x240ca1cc77ac9c65
-	.xword	0x2de92c6f592b0275
-	.xword	0x4a7484aa6ea6e483
-	.xword	0x5cb0a9dcbd41fbd4
-	.xword	0x76f988da831153b5
-	.xword	0x983e5152ee66dfab
-	.xword	0xa831c66d2db43210
-	.xword	0xb00327c898fb213f
-	.xword	0xbf597fc7beef0ee4
-	.xword	0xc6e00bf33da88fc2
-	.xword	0xd5a79147930aa725
-	.xword	0x06ca6351e003826f
-	.xword	0x142929670a0e6e70
-	.xword	0x27b70a8546d22ffc
-	.xword	0x2e1b21385c26c926
-	.xword	0x4d2c6dfc5ac42aed
-	.xword	0x53380d139d95b3df
-	.xword	0x650a73548baf63de
-	.xword	0x766a0abb3c77b2a8
-	.xword	0x81c2c92e47edaee6
-	.xword	0x92722c851482353b
-	.xword	0xa2bfe8a14cf10364
-	.xword	0xa81a664bbc423001
-	.xword	0xc24b8b70d0f89791
-	.xword	0xc76c51a30654be30
-	.xword	0xd192e819d6ef5218
-	.xword	0xd69906245565a910
-	.xword	0xf40e35855771202a
-	.xword	0x106aa07032bbd1b8
-	.xword	0x19a4c116b8d2d0c8
-	.xword	0x1e376c085141ab53
-	.xword	0x2748774cdf8eeb99
-	.xword	0x34b0bcb5e19b48a8
-	.xword	0x391c0cb3c5c95a63
-	.xword	0x4ed8aa4ae3418acb
-	.xword	0x5b9cca4f7763e373
-	.xword	0x682e6ff3d6b2b8a3
-	.xword	0x748f82ee5defb2fc
-	.xword	0x78a5636f43172f60
-	.xword	0x84c87814a1f0ab72
-	.xword	0x8cc702081a6439ec
-	.xword	0x90befffa23631e28
-	.xword	0xa4506cebde82bde9
-	.xword	0xbef9a3f7b2c67915
-	.xword	0xc67178f2e372532b
-	.xword	0xca273eceea26619c
-	.xword	0xd186b8c721c0c207
-	.xword	0xeada7dd6cde0eb1e
-	.xword	0xf57d4f7fee6ed178
-	.xword	0x06f067aa72176fba
-	.xword	0x0a637dc5a2c898a6
-	.xword	0x113f9804bef90dae
-	.xword	0x1b710b35131c471b
-	.xword	0x28db77f523047d84
-	.xword	0x32caab7b40c72493
-	.xword	0x3c9ebe0a15c9bebc
-	.xword	0x431d67c49c100d4c
-	.xword	0x4cc5d4becb3e42b6
-	.xword	0x597f299cfc657e2a
-	.xword	0x5fcb6fab3ad6faec
-	.xword	0x6c44198c4a475817
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha512_Len_crypto
diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
index d637a737a2b..b7b6ec773d5 100644
--- a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
@@ -33,51 +33,51 @@
 #include <wolfssl/wolfcrypt/sha512.h>
 
 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
-static const word64 L_SHA512_transform_neon_len_k[] = {
-    0x428a2f98d728ae22, 0x7137449123ef65cd,
-    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
-    0x3956c25bf348b538, 0x59f111f1b605d019,
-    0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
-    0xd807aa98a3030242, 0x12835b0145706fbe,
-    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
-    0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
-    0x9bdc06a725c71235, 0xc19bf174cf692694,
-    0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
-    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
-    0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
-    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
-    0x983e5152ee66dfab, 0xa831c66d2db43210,
-    0xb00327c898fb213f, 0xbf597fc7beef0ee4,
-    0xc6e00bf33da88fc2, 0xd5a79147930aa725,
-    0x06ca6351e003826f, 0x142929670a0e6e70,
-    0x27b70a8546d22ffc, 0x2e1b21385c26c926,
-    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
-    0x650a73548baf63de, 0x766a0abb3c77b2a8,
-    0x81c2c92e47edaee6, 0x92722c851482353b,
-    0xa2bfe8a14cf10364, 0xa81a664bbc423001,
-    0xc24b8b70d0f89791, 0xc76c51a30654be30,
-    0xd192e819d6ef5218, 0xd69906245565a910,
-    0xf40e35855771202a, 0x106aa07032bbd1b8,
-    0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
-    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
-    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
-    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
-    0x748f82ee5defb2fc, 0x78a5636f43172f60,
-    0x84c87814a1f0ab72, 0x8cc702081a6439ec,
-    0x90befffa23631e28, 0xa4506cebde82bde9,
-    0xbef9a3f7b2c67915, 0xc67178f2e372532b,
-    0xca273eceea26619c, 0xd186b8c721c0c207,
-    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
-    0x06f067aa72176fba, 0x0a637dc5a2c898a6,
-    0x113f9804bef90dae, 0x1b710b35131c471b,
-    0x28db77f523047d84, 0x32caab7b40c72493,
-    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
-    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
-    0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
+XALIGNED(16) static const word64 L_SHA512_transform_neon_len_k[] = {
+    0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
+    0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
+    0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
+    0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL,
+    0xd807aa98a3030242UL, 0x12835b0145706fbeUL,
+    0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
+    0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL,
+    0x9bdc06a725c71235UL, 0xc19bf174cf692694UL,
+    0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL,
+    0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL,
+    0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL,
+    0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
+    0x983e5152ee66dfabUL, 0xa831c66d2db43210UL,
+    0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL,
+    0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL,
+    0x06ca6351e003826fUL, 0x142929670a0e6e70UL,
+    0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL,
+    0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
+    0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL,
+    0x81c2c92e47edaee6UL, 0x92722c851482353bUL,
+    0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL,
+    0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL,
+    0xd192e819d6ef5218UL, 0xd69906245565a910UL,
+    0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
+    0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL,
+    0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL,
+    0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL,
+    0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL,
+    0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL,
+    0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
+    0x90befffa23631e28UL, 0xa4506cebde82bde9UL,
+    0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL,
+    0xca273eceea26619cUL, 0xd186b8c721c0c207UL,
+    0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL,
+    0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL,
+    0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
+    0x28db77f523047d84UL, 0x32caab7b40c72493UL,
+    0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL,
+    0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL,
+    0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
 };
 
-static const word64 L_SHA512_transform_neon_len_r8[] = {
-    0x0007060504030201, 0x080f0e0d0c0b0a09,
+XALIGNED(16) static const word64 L_SHA512_transform_neon_len_r8[] = {
+    0x0007060504030201UL, 0x080f0e0d0c0b0a09UL,
 };
 
 void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
@@ -93,7 +93,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
         "ldp	x10, x11, [%x[sha512], #48]\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_sha512_len_neon_begin_%=: \n\t"
+    "L_sha512_len_neon_begin_%=:\n\t"
         /* Load W */
         /* Copy digest to add in at end */
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[data]], #0x40\n\t"
@@ -119,7 +119,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
         "mov	%[r8], #4\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_sha512_len_neon_start_%=: \n\t"
+    "L_sha512_len_neon_start_%=:\n\t"
         /* Round 0 */
         "mov	x13, v0.d[0]\n\t"
         "ldr	x15, [%[k]], #8\n\t"
@@ -995,47 +995,47 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
 }
 
 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512
-static const word64 L_SHA512_trans_crypto_len_k[] = {
-    0x428a2f98d728ae22, 0x7137449123ef65cd,
-    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
-    0x3956c25bf348b538, 0x59f111f1b605d019,
-    0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
-    0xd807aa98a3030242, 0x12835b0145706fbe,
-    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
-    0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
-    0x9bdc06a725c71235, 0xc19bf174cf692694,
-    0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
-    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
-    0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
-    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
-    0x983e5152ee66dfab, 0xa831c66d2db43210,
-    0xb00327c898fb213f, 0xbf597fc7beef0ee4,
-    0xc6e00bf33da88fc2, 0xd5a79147930aa725,
-    0x06ca6351e003826f, 0x142929670a0e6e70,
-    0x27b70a8546d22ffc, 0x2e1b21385c26c926,
-    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
-    0x650a73548baf63de, 0x766a0abb3c77b2a8,
-    0x81c2c92e47edaee6, 0x92722c851482353b,
-    0xa2bfe8a14cf10364, 0xa81a664bbc423001,
-    0xc24b8b70d0f89791, 0xc76c51a30654be30,
-    0xd192e819d6ef5218, 0xd69906245565a910,
-    0xf40e35855771202a, 0x106aa07032bbd1b8,
-    0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
-    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
-    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
-    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
-    0x748f82ee5defb2fc, 0x78a5636f43172f60,
-    0x84c87814a1f0ab72, 0x8cc702081a6439ec,
-    0x90befffa23631e28, 0xa4506cebde82bde9,
-    0xbef9a3f7b2c67915, 0xc67178f2e372532b,
-    0xca273eceea26619c, 0xd186b8c721c0c207,
-    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
-    0x06f067aa72176fba, 0x0a637dc5a2c898a6,
-    0x113f9804bef90dae, 0x1b710b35131c471b,
-    0x28db77f523047d84, 0x32caab7b40c72493,
-    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
-    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
-    0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
+XALIGNED(16) static const word64 L_SHA512_trans_crypto_len_k[] = {
+    0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
+    0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
+    0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
+    0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL,
+    0xd807aa98a3030242UL, 0x12835b0145706fbeUL,
+    0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
+    0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL,
+    0x9bdc06a725c71235UL, 0xc19bf174cf692694UL,
+    0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL,
+    0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL,
+    0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL,
+    0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
+    0x983e5152ee66dfabUL, 0xa831c66d2db43210UL,
+    0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL,
+    0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL,
+    0x06ca6351e003826fUL, 0x142929670a0e6e70UL,
+    0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL,
+    0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
+    0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL,
+    0x81c2c92e47edaee6UL, 0x92722c851482353bUL,
+    0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL,
+    0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL,
+    0xd192e819d6ef5218UL, 0xd69906245565a910UL,
+    0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
+    0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL,
+    0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL,
+    0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL,
+    0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL,
+    0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL,
+    0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
+    0x90befffa23631e28UL, 0xa4506cebde82bde9UL,
+    0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL,
+    0xca273eceea26619cUL, 0xd186b8c721c0c207UL,
+    0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL,
+    0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL,
+    0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
+    0x28db77f523047d84UL, 0x32caab7b40c72493UL,
+    0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL,
+    0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL,
+    0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
 };
 
 void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
@@ -1055,7 +1055,7 @@ void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
         "ld1	{v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_sha512_len_crypto_begin_%=: \n\t"
+    "L_sha512_len_crypto_begin_%=:\n\t"
         "mov	x3, %[k]\n\t"
         /* Load W */
         "ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [%x[data]], #0x40\n\t"
diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S
index ceea6793caf..2275959d2ad 100644
--- a/wolfcrypt/src/port/arm/thumb2-aes-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S
@@ -34,551 +34,203 @@
 	.syntax unified
 #ifndef NO_AES
 #ifdef HAVE_AES_DECRYPT
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_td_data, %object
 	.size	L_AES_Thumb2_td_data, 1024
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_td_data:
-	.word	0x5051f4a7
-	.word	0x537e4165
-	.word	0xc31a17a4
-	.word	0x963a275e
-	.word	0xcb3bab6b
-	.word	0xf11f9d45
-	.word	0xabacfa58
-	.word	0x934be303
-	.word	0x552030fa
-	.word	0xf6ad766d
-	.word	0x9188cc76
-	.word	0x25f5024c
-	.word	0xfc4fe5d7
-	.word	0xd7c52acb
-	.word	0x80263544
-	.word	0x8fb562a3
-	.word	0x49deb15a
-	.word	0x6725ba1b
-	.word	0x9845ea0e
-	.word	0xe15dfec0
-	.word	0x2c32f75
-	.word	0x12814cf0
-	.word	0xa38d4697
-	.word	0xc66bd3f9
-	.word	0xe7038f5f
-	.word	0x9515929c
-	.word	0xebbf6d7a
-	.word	0xda955259
-	.word	0x2dd4be83
-	.word	0xd3587421
-	.word	0x2949e069
-	.word	0x448ec9c8
-	.word	0x6a75c289
-	.word	0x78f48e79
-	.word	0x6b99583e
-	.word	0xdd27b971
-	.word	0xb6bee14f
-	.word	0x17f088ad
-	.word	0x66c920ac
-	.word	0xb47dce3a
-	.word	0x1863df4a
-	.word	0x82e51a31
-	.word	0x60975133
-	.word	0x4562537f
-	.word	0xe0b16477
-	.word	0x84bb6bae
-	.word	0x1cfe81a0
-	.word	0x94f9082b
-	.word	0x58704868
-	.word	0x198f45fd
-	.word	0x8794de6c
-	.word	0xb7527bf8
-	.word	0x23ab73d3
-	.word	0xe2724b02
-	.word	0x57e31f8f
-	.word	0x2a6655ab
-	.word	0x7b2eb28
-	.word	0x32fb5c2
-	.word	0x9a86c57b
-	.word	0xa5d33708
-	.word	0xf2302887
-	.word	0xb223bfa5
-	.word	0xba02036a
-	.word	0x5ced1682
-	.word	0x2b8acf1c
-	.word	0x92a779b4
-	.word	0xf0f307f2
-	.word	0xa14e69e2
-	.word	0xcd65daf4
-	.word	0xd50605be
-	.word	0x1fd13462
-	.word	0x8ac4a6fe
-	.word	0x9d342e53
-	.word	0xa0a2f355
-	.word	0x32058ae1
-	.word	0x75a4f6eb
-	.word	0x390b83ec
-	.word	0xaa4060ef
-	.word	0x65e719f
-	.word	0x51bd6e10
-	.word	0xf93e218a
-	.word	0x3d96dd06
-	.word	0xaedd3e05
-	.word	0x464de6bd
-	.word	0xb591548d
-	.word	0x571c45d
-	.word	0x6f0406d4
-	.word	0xff605015
-	.word	0x241998fb
-	.word	0x97d6bde9
-	.word	0xcc894043
-	.word	0x7767d99e
-	.word	0xbdb0e842
-	.word	0x8807898b
-	.word	0x38e7195b
-	.word	0xdb79c8ee
-	.word	0x47a17c0a
-	.word	0xe97c420f
-	.word	0xc9f8841e
-	.word	0x0
-	.word	0x83098086
-	.word	0x48322bed
-	.word	0xac1e1170
-	.word	0x4e6c5a72
-	.word	0xfbfd0eff
-	.word	0x560f8538
-	.word	0x1e3daed5
-	.word	0x27362d39
-	.word	0x640a0fd9
-	.word	0x21685ca6
-	.word	0xd19b5b54
-	.word	0x3a24362e
-	.word	0xb10c0a67
-	.word	0xf9357e7
-	.word	0xd2b4ee96
-	.word	0x9e1b9b91
-	.word	0x4f80c0c5
-	.word	0xa261dc20
-	.word	0x695a774b
-	.word	0x161c121a
-	.word	0xae293ba
-	.word	0xe5c0a02a
-	.word	0x433c22e0
-	.word	0x1d121b17
-	.word	0xb0e090d
-	.word	0xadf28bc7
-	.word	0xb92db6a8
-	.word	0xc8141ea9
-	.word	0x8557f119
-	.word	0x4caf7507
-	.word	0xbbee99dd
-	.word	0xfda37f60
-	.word	0x9ff70126
-	.word	0xbc5c72f5
-	.word	0xc544663b
-	.word	0x345bfb7e
-	.word	0x768b4329
-	.word	0xdccb23c6
-	.word	0x68b6edfc
-	.word	0x63b8e4f1
-	.word	0xcad731dc
-	.word	0x10426385
-	.word	0x40139722
-	.word	0x2084c611
-	.word	0x7d854a24
-	.word	0xf8d2bb3d
-	.word	0x11aef932
-	.word	0x6dc729a1
-	.word	0x4b1d9e2f
-	.word	0xf3dcb230
-	.word	0xec0d8652
-	.word	0xd077c1e3
-	.word	0x6c2bb316
-	.word	0x99a970b9
-	.word	0xfa119448
-	.word	0x2247e964
-	.word	0xc4a8fc8c
-	.word	0x1aa0f03f
-	.word	0xd8567d2c
-	.word	0xef223390
-	.word	0xc787494e
-	.word	0xc1d938d1
-	.word	0xfe8ccaa2
-	.word	0x3698d40b
-	.word	0xcfa6f581
-	.word	0x28a57ade
-	.word	0x26dab78e
-	.word	0xa43fadbf
-	.word	0xe42c3a9d
-	.word	0xd507892
-	.word	0x9b6a5fcc
-	.word	0x62547e46
-	.word	0xc2f68d13
-	.word	0xe890d8b8
-	.word	0x5e2e39f7
-	.word	0xf582c3af
-	.word	0xbe9f5d80
-	.word	0x7c69d093
-	.word	0xa96fd52d
-	.word	0xb3cf2512
-	.word	0x3bc8ac99
-	.word	0xa710187d
-	.word	0x6ee89c63
-	.word	0x7bdb3bbb
-	.word	0x9cd2678
-	.word	0xf46e5918
-	.word	0x1ec9ab7
-	.word	0xa8834f9a
-	.word	0x65e6956e
-	.word	0x7eaaffe6
-	.word	0x821bccf
-	.word	0xe6ef15e8
-	.word	0xd9bae79b
-	.word	0xce4a6f36
-	.word	0xd4ea9f09
-	.word	0xd629b07c
-	.word	0xaf31a4b2
-	.word	0x312a3f23
-	.word	0x30c6a594
-	.word	0xc035a266
-	.word	0x37744ebc
-	.word	0xa6fc82ca
-	.word	0xb0e090d0
-	.word	0x1533a7d8
-	.word	0x4af10498
-	.word	0xf741ecda
-	.word	0xe7fcd50
-	.word	0x2f1791f6
-	.word	0x8d764dd6
-	.word	0x4d43efb0
-	.word	0x54ccaa4d
-	.word	0xdfe49604
-	.word	0xe39ed1b5
-	.word	0x1b4c6a88
-	.word	0xb8c12c1f
-	.word	0x7f466551
-	.word	0x49d5eea
-	.word	0x5d018c35
-	.word	0x73fa8774
-	.word	0x2efb0b41
-	.word	0x5ab3671d
-	.word	0x5292dbd2
-	.word	0x33e91056
-	.word	0x136dd647
-	.word	0x8c9ad761
-	.word	0x7a37a10c
-	.word	0x8e59f814
-	.word	0x89eb133c
-	.word	0xeecea927
-	.word	0x35b761c9
-	.word	0xede11ce5
-	.word	0x3c7a47b1
-	.word	0x599cd2df
-	.word	0x3f55f273
-	.word	0x791814ce
-	.word	0xbf73c737
-	.word	0xea53f7cd
-	.word	0x5b5ffdaa
-	.word	0x14df3d6f
-	.word	0x867844db
-	.word	0x81caaff3
-	.word	0x3eb968c4
-	.word	0x2c382434
-	.word	0x5fc2a340
-	.word	0x72161dc3
-	.word	0xcbce225
-	.word	0x8b283c49
-	.word	0x41ff0d95
-	.word	0x7139a801
-	.word	0xde080cb3
-	.word	0x9cd8b4e4
-	.word	0x906456c1
-	.word	0x617bcb84
-	.word	0x70d532b6
-	.word	0x74486c5c
-	.word	0x42d0b857
+	.long	0x5051f4a7,0x537e4165,0xc31a17a4,0x963a275e
+	.long	0xcb3bab6b,0xf11f9d45,0xabacfa58,0x934be303
+	.long	0x552030fa,0xf6ad766d,0x9188cc76,0x25f5024c
+	.long	0xfc4fe5d7,0xd7c52acb,0x80263544,0x8fb562a3
+	.long	0x49deb15a,0x6725ba1b,0x9845ea0e,0xe15dfec0
+	.long	0x02c32f75,0x12814cf0,0xa38d4697,0xc66bd3f9
+	.long	0xe7038f5f,0x9515929c,0xebbf6d7a,0xda955259
+	.long	0x2dd4be83,0xd3587421,0x2949e069,0x448ec9c8
+	.long	0x6a75c289,0x78f48e79,0x6b99583e,0xdd27b971
+	.long	0xb6bee14f,0x17f088ad,0x66c920ac,0xb47dce3a
+	.long	0x1863df4a,0x82e51a31,0x60975133,0x4562537f
+	.long	0xe0b16477,0x84bb6bae,0x1cfe81a0,0x94f9082b
+	.long	0x58704868,0x198f45fd,0x8794de6c,0xb7527bf8
+	.long	0x23ab73d3,0xe2724b02,0x57e31f8f,0x2a6655ab
+	.long	0x07b2eb28,0x032fb5c2,0x9a86c57b,0xa5d33708
+	.long	0xf2302887,0xb223bfa5,0xba02036a,0x5ced1682
+	.long	0x2b8acf1c,0x92a779b4,0xf0f307f2,0xa14e69e2
+	.long	0xcd65daf4,0xd50605be,0x1fd13462,0x8ac4a6fe
+	.long	0x9d342e53,0xa0a2f355,0x32058ae1,0x75a4f6eb
+	.long	0x390b83ec,0xaa4060ef,0x065e719f,0x51bd6e10
+	.long	0xf93e218a,0x3d96dd06,0xaedd3e05,0x464de6bd
+	.long	0xb591548d,0x0571c45d,0x6f0406d4,0xff605015
+	.long	0x241998fb,0x97d6bde9,0xcc894043,0x7767d99e
+	.long	0xbdb0e842,0x8807898b,0x38e7195b,0xdb79c8ee
+	.long	0x47a17c0a,0xe97c420f,0xc9f8841e,0x00000000
+	.long	0x83098086,0x48322bed,0xac1e1170,0x4e6c5a72
+	.long	0xfbfd0eff,0x560f8538,0x1e3daed5,0x27362d39
+	.long	0x640a0fd9,0x21685ca6,0xd19b5b54,0x3a24362e
+	.long	0xb10c0a67,0x0f9357e7,0xd2b4ee96,0x9e1b9b91
+	.long	0x4f80c0c5,0xa261dc20,0x695a774b,0x161c121a
+	.long	0x0ae293ba,0xe5c0a02a,0x433c22e0,0x1d121b17
+	.long	0x0b0e090d,0xadf28bc7,0xb92db6a8,0xc8141ea9
+	.long	0x8557f119,0x4caf7507,0xbbee99dd,0xfda37f60
+	.long	0x9ff70126,0xbc5c72f5,0xc544663b,0x345bfb7e
+	.long	0x768b4329,0xdccb23c6,0x68b6edfc,0x63b8e4f1
+	.long	0xcad731dc,0x10426385,0x40139722,0x2084c611
+	.long	0x7d854a24,0xf8d2bb3d,0x11aef932,0x6dc729a1
+	.long	0x4b1d9e2f,0xf3dcb230,0xec0d8652,0xd077c1e3
+	.long	0x6c2bb316,0x99a970b9,0xfa119448,0x2247e964
+	.long	0xc4a8fc8c,0x1aa0f03f,0xd8567d2c,0xef223390
+	.long	0xc787494e,0xc1d938d1,0xfe8ccaa2,0x3698d40b
+	.long	0xcfa6f581,0x28a57ade,0x26dab78e,0xa43fadbf
+	.long	0xe42c3a9d,0x0d507892,0x9b6a5fcc,0x62547e46
+	.long	0xc2f68d13,0xe890d8b8,0x5e2e39f7,0xf582c3af
+	.long	0xbe9f5d80,0x7c69d093,0xa96fd52d,0xb3cf2512
+	.long	0x3bc8ac99,0xa710187d,0x6ee89c63,0x7bdb3bbb
+	.long	0x09cd2678,0xf46e5918,0x01ec9ab7,0xa8834f9a
+	.long	0x65e6956e,0x7eaaffe6,0x0821bccf,0xe6ef15e8
+	.long	0xd9bae79b,0xce4a6f36,0xd4ea9f09,0xd629b07c
+	.long	0xaf31a4b2,0x312a3f23,0x30c6a594,0xc035a266
+	.long	0x37744ebc,0xa6fc82ca,0xb0e090d0,0x1533a7d8
+	.long	0x4af10498,0xf741ecda,0x0e7fcd50,0x2f1791f6
+	.long	0x8d764dd6,0x4d43efb0,0x54ccaa4d,0xdfe49604
+	.long	0xe39ed1b5,0x1b4c6a88,0xb8c12c1f,0x7f466551
+	.long	0x049d5eea,0x5d018c35,0x73fa8774,0x2efb0b41
+	.long	0x5ab3671d,0x5292dbd2,0x33e91056,0x136dd647
+	.long	0x8c9ad761,0x7a37a10c,0x8e59f814,0x89eb133c
+	.long	0xeecea927,0x35b761c9,0xede11ce5,0x3c7a47b1
+	.long	0x599cd2df,0x3f55f273,0x791814ce,0xbf73c737
+	.long	0xea53f7cd,0x5b5ffdaa,0x14df3d6f,0x867844db
+	.long	0x81caaff3,0x3eb968c4,0x2c382434,0x5fc2a340
+	.long	0x72161dc3,0x0cbce225,0x8b283c49,0x41ff0d95
+	.long	0x7139a801,0xde080cb3,0x9cd8b4e4,0x906456c1
+	.long	0x617bcb84,0x70d532b6,0x74486c5c,0x42d0b857
 #endif /* HAVE_AES_DECRYPT */
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_te_data, %object
 	.size	L_AES_Thumb2_te_data, 1024
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_te_data:
-	.word	0xa5c66363
-	.word	0x84f87c7c
-	.word	0x99ee7777
-	.word	0x8df67b7b
-	.word	0xdfff2f2
-	.word	0xbdd66b6b
-	.word	0xb1de6f6f
-	.word	0x5491c5c5
-	.word	0x50603030
-	.word	0x3020101
-	.word	0xa9ce6767
-	.word	0x7d562b2b
-	.word	0x19e7fefe
-	.word	0x62b5d7d7
-	.word	0xe64dabab
-	.word	0x9aec7676
-	.word	0x458fcaca
-	.word	0x9d1f8282
-	.word	0x4089c9c9
-	.word	0x87fa7d7d
-	.word	0x15effafa
-	.word	0xebb25959
-	.word	0xc98e4747
-	.word	0xbfbf0f0
-	.word	0xec41adad
-	.word	0x67b3d4d4
-	.word	0xfd5fa2a2
-	.word	0xea45afaf
-	.word	0xbf239c9c
-	.word	0xf753a4a4
-	.word	0x96e47272
-	.word	0x5b9bc0c0
-	.word	0xc275b7b7
-	.word	0x1ce1fdfd
-	.word	0xae3d9393
-	.word	0x6a4c2626
-	.word	0x5a6c3636
-	.word	0x417e3f3f
-	.word	0x2f5f7f7
-	.word	0x4f83cccc
-	.word	0x5c683434
-	.word	0xf451a5a5
-	.word	0x34d1e5e5
-	.word	0x8f9f1f1
-	.word	0x93e27171
-	.word	0x73abd8d8
-	.word	0x53623131
-	.word	0x3f2a1515
-	.word	0xc080404
-	.word	0x5295c7c7
-	.word	0x65462323
-	.word	0x5e9dc3c3
-	.word	0x28301818
-	.word	0xa1379696
-	.word	0xf0a0505
-	.word	0xb52f9a9a
-	.word	0x90e0707
-	.word	0x36241212
-	.word	0x9b1b8080
-	.word	0x3ddfe2e2
-	.word	0x26cdebeb
-	.word	0x694e2727
-	.word	0xcd7fb2b2
-	.word	0x9fea7575
-	.word	0x1b120909
-	.word	0x9e1d8383
-	.word	0x74582c2c
-	.word	0x2e341a1a
-	.word	0x2d361b1b
-	.word	0xb2dc6e6e
-	.word	0xeeb45a5a
-	.word	0xfb5ba0a0
-	.word	0xf6a45252
-	.word	0x4d763b3b
-	.word	0x61b7d6d6
-	.word	0xce7db3b3
-	.word	0x7b522929
-	.word	0x3edde3e3
-	.word	0x715e2f2f
-	.word	0x97138484
-	.word	0xf5a65353
-	.word	0x68b9d1d1
-	.word	0x0
-	.word	0x2cc1eded
-	.word	0x60402020
-	.word	0x1fe3fcfc
-	.word	0xc879b1b1
-	.word	0xedb65b5b
-	.word	0xbed46a6a
-	.word	0x468dcbcb
-	.word	0xd967bebe
-	.word	0x4b723939
-	.word	0xde944a4a
-	.word	0xd4984c4c
-	.word	0xe8b05858
-	.word	0x4a85cfcf
-	.word	0x6bbbd0d0
-	.word	0x2ac5efef
-	.word	0xe54faaaa
-	.word	0x16edfbfb
-	.word	0xc5864343
-	.word	0xd79a4d4d
-	.word	0x55663333
-	.word	0x94118585
-	.word	0xcf8a4545
-	.word	0x10e9f9f9
-	.word	0x6040202
-	.word	0x81fe7f7f
-	.word	0xf0a05050
-	.word	0x44783c3c
-	.word	0xba259f9f
-	.word	0xe34ba8a8
-	.word	0xf3a25151
-	.word	0xfe5da3a3
-	.word	0xc0804040
-	.word	0x8a058f8f
-	.word	0xad3f9292
-	.word	0xbc219d9d
-	.word	0x48703838
-	.word	0x4f1f5f5
-	.word	0xdf63bcbc
-	.word	0xc177b6b6
-	.word	0x75afdada
-	.word	0x63422121
-	.word	0x30201010
-	.word	0x1ae5ffff
-	.word	0xefdf3f3
-	.word	0x6dbfd2d2
-	.word	0x4c81cdcd
-	.word	0x14180c0c
-	.word	0x35261313
-	.word	0x2fc3ecec
-	.word	0xe1be5f5f
-	.word	0xa2359797
-	.word	0xcc884444
-	.word	0x392e1717
-	.word	0x5793c4c4
-	.word	0xf255a7a7
-	.word	0x82fc7e7e
-	.word	0x477a3d3d
-	.word	0xacc86464
-	.word	0xe7ba5d5d
-	.word	0x2b321919
-	.word	0x95e67373
-	.word	0xa0c06060
-	.word	0x98198181
-	.word	0xd19e4f4f
-	.word	0x7fa3dcdc
-	.word	0x66442222
-	.word	0x7e542a2a
-	.word	0xab3b9090
-	.word	0x830b8888
-	.word	0xca8c4646
-	.word	0x29c7eeee
-	.word	0xd36bb8b8
-	.word	0x3c281414
-	.word	0x79a7dede
-	.word	0xe2bc5e5e
-	.word	0x1d160b0b
-	.word	0x76addbdb
-	.word	0x3bdbe0e0
-	.word	0x56643232
-	.word	0x4e743a3a
-	.word	0x1e140a0a
-	.word	0xdb924949
-	.word	0xa0c0606
-	.word	0x6c482424
-	.word	0xe4b85c5c
-	.word	0x5d9fc2c2
-	.word	0x6ebdd3d3
-	.word	0xef43acac
-	.word	0xa6c46262
-	.word	0xa8399191
-	.word	0xa4319595
-	.word	0x37d3e4e4
-	.word	0x8bf27979
-	.word	0x32d5e7e7
-	.word	0x438bc8c8
-	.word	0x596e3737
-	.word	0xb7da6d6d
-	.word	0x8c018d8d
-	.word	0x64b1d5d5
-	.word	0xd29c4e4e
-	.word	0xe049a9a9
-	.word	0xb4d86c6c
-	.word	0xfaac5656
-	.word	0x7f3f4f4
-	.word	0x25cfeaea
-	.word	0xafca6565
-	.word	0x8ef47a7a
-	.word	0xe947aeae
-	.word	0x18100808
-	.word	0xd56fbaba
-	.word	0x88f07878
-	.word	0x6f4a2525
-	.word	0x725c2e2e
-	.word	0x24381c1c
-	.word	0xf157a6a6
-	.word	0xc773b4b4
-	.word	0x5197c6c6
-	.word	0x23cbe8e8
-	.word	0x7ca1dddd
-	.word	0x9ce87474
-	.word	0x213e1f1f
-	.word	0xdd964b4b
-	.word	0xdc61bdbd
-	.word	0x860d8b8b
-	.word	0x850f8a8a
-	.word	0x90e07070
-	.word	0x427c3e3e
-	.word	0xc471b5b5
-	.word	0xaacc6666
-	.word	0xd8904848
-	.word	0x5060303
-	.word	0x1f7f6f6
-	.word	0x121c0e0e
-	.word	0xa3c26161
-	.word	0x5f6a3535
-	.word	0xf9ae5757
-	.word	0xd069b9b9
-	.word	0x91178686
-	.word	0x5899c1c1
-	.word	0x273a1d1d
-	.word	0xb9279e9e
-	.word	0x38d9e1e1
-	.word	0x13ebf8f8
-	.word	0xb32b9898
-	.word	0x33221111
-	.word	0xbbd26969
-	.word	0x70a9d9d9
-	.word	0x89078e8e
-	.word	0xa7339494
-	.word	0xb62d9b9b
-	.word	0x223c1e1e
-	.word	0x92158787
-	.word	0x20c9e9e9
-	.word	0x4987cece
-	.word	0xffaa5555
-	.word	0x78502828
-	.word	0x7aa5dfdf
-	.word	0x8f038c8c
-	.word	0xf859a1a1
-	.word	0x80098989
-	.word	0x171a0d0d
-	.word	0xda65bfbf
-	.word	0x31d7e6e6
-	.word	0xc6844242
-	.word	0xb8d06868
-	.word	0xc3824141
-	.word	0xb0299999
-	.word	0x775a2d2d
-	.word	0x111e0f0f
-	.word	0xcb7bb0b0
-	.word	0xfca85454
-	.word	0xd66dbbbb
-	.word	0x3a2c1616
+	.long	0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b
+	.long	0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5
+	.long	0x50603030,0x03020101,0xa9ce6767,0x7d562b2b
+	.long	0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676
+	.long	0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d
+	.long	0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0
+	.long	0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf
+	.long	0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0
+	.long	0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626
+	.long	0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc
+	.long	0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1
+	.long	0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515
+	.long	0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3
+	.long	0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a
+	.long	0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2
+	.long	0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575
+	.long	0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a
+	.long	0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0
+	.long	0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3
+	.long	0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484
+	.long	0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded
+	.long	0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b
+	.long	0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939
+	.long	0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf
+	.long	0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb
+	.long	0xc5864343,0xd79a4d4d,0x55663333,0x94118585
+	.long	0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f
+	.long	0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8
+	.long	0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f
+	.long	0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5
+	.long	0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121
+	.long	0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2
+	.long	0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec
+	.long	0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717
+	.long	0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d
+	.long	0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373
+	.long	0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc
+	.long	0x66442222,0x7e542a2a,0xab3b9090,0x830b8888
+	.long	0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414
+	.long	0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb
+	.long	0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a
+	.long	0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c
+	.long	0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262
+	.long	0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979
+	.long	0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d
+	.long	0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9
+	.long	0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea
+	.long	0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808
+	.long	0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e
+	.long	0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6
+	.long	0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f
+	.long	0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a
+	.long	0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666
+	.long	0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e
+	.long	0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9
+	.long	0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e
+	.long	0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111
+	.long	0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494
+	.long	0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9
+	.long	0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf
+	.long	0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d
+	.long	0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868
+	.long	0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f
+	.long	0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616
 #endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
         * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
 #ifdef HAVE_AES_DECRYPT
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_td, %object
 	.size	L_AES_Thumb2_td, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_td:
-	.word	L_AES_Thumb2_td_data
+	.long	L_AES_Thumb2_td_data
 #endif /* HAVE_AES_DECRYPT */
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_te, %object
 	.size	L_AES_Thumb2_te, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_te:
-	.word	L_AES_Thumb2_te_data
+	.long	L_AES_Thumb2_te_data
 #endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
         * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
 #ifdef HAVE_AES_DECRYPT
@@ -683,21 +335,23 @@ L_AES_invert_key_mix_loop:
 	/* Cycle Count = 165 */
 	.size	AES_invert_key,.-AES_invert_key
 #endif /* HAVE_AES_DECRYPT */
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_rcon, %object
 	.size	L_AES_Thumb2_rcon, 40
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_rcon:
-	.word	0x1000000
-	.word	0x2000000
-	.word	0x4000000
-	.word	0x8000000
-	.word	0x10000000
-	.word	0x20000000
-	.word	0x40000000
-	.word	0x80000000
-	.word	0x1b000000
-	.word	0x36000000
+	.long	0x01000000,0x02000000,0x04000000,0x08000000
+	.long	0x10000000,0x20000000,0x40000000,0x80000000
+	.long	0x1b000000,0x36000000
 	.text
 	.align	4
 	.globl	AES_set_encrypt_key
@@ -1142,12 +796,21 @@ L_AES_encrypt_block_nr:
 #endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */
 #if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_te_ecb, %object
 	.size	L_AES_Thumb2_te_ecb, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_te_ecb:
-	.word	L_AES_Thumb2_te_data
+	.long	L_AES_Thumb2_te_data
 #endif /* HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT ||
         * WOLFSSL_AES_COUNTER */
 #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
@@ -2727,12 +2390,21 @@ L_AES_CBC_encrypt_end:
 	.size	AES_CBC_encrypt,.-AES_CBC_encrypt
 #endif /* HAVE_AES_CBC */
 #ifdef WOLFSSL_AES_COUNTER
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_te_ctr, %object
 	.size	L_AES_Thumb2_te_ctr, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_te_ctr:
-	.word	L_AES_Thumb2_te_data
+	.long	L_AES_Thumb2_te_data
 	.text
 	.align	4
 	.globl	AES_CTR_encrypt
@@ -3768,273 +3440,67 @@ L_AES_decrypt_block_nr:
 	/* Cycle Count = 285 */
 	.size	AES_decrypt_block,.-AES_decrypt_block
 #endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_td_ecb, %object
 	.size	L_AES_Thumb2_td_ecb, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_td_ecb:
-	.word	L_AES_Thumb2_td_data
+	.long	L_AES_Thumb2_td_data
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_td4, %object
 	.size	L_AES_Thumb2_td4, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 4-byte aligned, 32-bit aligned */
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_AES_Thumb2_td4:
-	.byte	0x52
-	.byte	0x9
-	.byte	0x6a
-	.byte	0xd5
-	.byte	0x30
-	.byte	0x36
-	.byte	0xa5
-	.byte	0x38
-	.byte	0xbf
-	.byte	0x40
-	.byte	0xa3
-	.byte	0x9e
-	.byte	0x81
-	.byte	0xf3
-	.byte	0xd7
-	.byte	0xfb
-	.byte	0x7c
-	.byte	0xe3
-	.byte	0x39
-	.byte	0x82
-	.byte	0x9b
-	.byte	0x2f
-	.byte	0xff
-	.byte	0x87
-	.byte	0x34
-	.byte	0x8e
-	.byte	0x43
-	.byte	0x44
-	.byte	0xc4
-	.byte	0xde
-	.byte	0xe9
-	.byte	0xcb
-	.byte	0x54
-	.byte	0x7b
-	.byte	0x94
-	.byte	0x32
-	.byte	0xa6
-	.byte	0xc2
-	.byte	0x23
-	.byte	0x3d
-	.byte	0xee
-	.byte	0x4c
-	.byte	0x95
-	.byte	0xb
-	.byte	0x42
-	.byte	0xfa
-	.byte	0xc3
-	.byte	0x4e
-	.byte	0x8
-	.byte	0x2e
-	.byte	0xa1
-	.byte	0x66
-	.byte	0x28
-	.byte	0xd9
-	.byte	0x24
-	.byte	0xb2
-	.byte	0x76
-	.byte	0x5b
-	.byte	0xa2
-	.byte	0x49
-	.byte	0x6d
-	.byte	0x8b
-	.byte	0xd1
-	.byte	0x25
-	.byte	0x72
-	.byte	0xf8
-	.byte	0xf6
-	.byte	0x64
-	.byte	0x86
-	.byte	0x68
-	.byte	0x98
-	.byte	0x16
-	.byte	0xd4
-	.byte	0xa4
-	.byte	0x5c
-	.byte	0xcc
-	.byte	0x5d
-	.byte	0x65
-	.byte	0xb6
-	.byte	0x92
-	.byte	0x6c
-	.byte	0x70
-	.byte	0x48
-	.byte	0x50
-	.byte	0xfd
-	.byte	0xed
-	.byte	0xb9
-	.byte	0xda
-	.byte	0x5e
-	.byte	0x15
-	.byte	0x46
-	.byte	0x57
-	.byte	0xa7
-	.byte	0x8d
-	.byte	0x9d
-	.byte	0x84
-	.byte	0x90
-	.byte	0xd8
-	.byte	0xab
-	.byte	0x0
-	.byte	0x8c
-	.byte	0xbc
-	.byte	0xd3
-	.byte	0xa
-	.byte	0xf7
-	.byte	0xe4
-	.byte	0x58
-	.byte	0x5
-	.byte	0xb8
-	.byte	0xb3
-	.byte	0x45
-	.byte	0x6
-	.byte	0xd0
-	.byte	0x2c
-	.byte	0x1e
-	.byte	0x8f
-	.byte	0xca
-	.byte	0x3f
-	.byte	0xf
-	.byte	0x2
-	.byte	0xc1
-	.byte	0xaf
-	.byte	0xbd
-	.byte	0x3
-	.byte	0x1
-	.byte	0x13
-	.byte	0x8a
-	.byte	0x6b
-	.byte	0x3a
-	.byte	0x91
-	.byte	0x11
-	.byte	0x41
-	.byte	0x4f
-	.byte	0x67
-	.byte	0xdc
-	.byte	0xea
-	.byte	0x97
-	.byte	0xf2
-	.byte	0xcf
-	.byte	0xce
-	.byte	0xf0
-	.byte	0xb4
-	.byte	0xe6
-	.byte	0x73
-	.byte	0x96
-	.byte	0xac
-	.byte	0x74
-	.byte	0x22
-	.byte	0xe7
-	.byte	0xad
-	.byte	0x35
-	.byte	0x85
-	.byte	0xe2
-	.byte	0xf9
-	.byte	0x37
-	.byte	0xe8
-	.byte	0x1c
-	.byte	0x75
-	.byte	0xdf
-	.byte	0x6e
-	.byte	0x47
-	.byte	0xf1
-	.byte	0x1a
-	.byte	0x71
-	.byte	0x1d
-	.byte	0x29
-	.byte	0xc5
-	.byte	0x89
-	.byte	0x6f
-	.byte	0xb7
-	.byte	0x62
-	.byte	0xe
-	.byte	0xaa
-	.byte	0x18
-	.byte	0xbe
-	.byte	0x1b
-	.byte	0xfc
-	.byte	0x56
-	.byte	0x3e
-	.byte	0x4b
-	.byte	0xc6
-	.byte	0xd2
-	.byte	0x79
-	.byte	0x20
-	.byte	0x9a
-	.byte	0xdb
-	.byte	0xc0
-	.byte	0xfe
-	.byte	0x78
-	.byte	0xcd
-	.byte	0x5a
-	.byte	0xf4
-	.byte	0x1f
-	.byte	0xdd
-	.byte	0xa8
-	.byte	0x33
-	.byte	0x88
-	.byte	0x7
-	.byte	0xc7
-	.byte	0x31
-	.byte	0xb1
-	.byte	0x12
-	.byte	0x10
-	.byte	0x59
-	.byte	0x27
-	.byte	0x80
-	.byte	0xec
-	.byte	0x5f
-	.byte	0x60
-	.byte	0x51
-	.byte	0x7f
-	.byte	0xa9
-	.byte	0x19
-	.byte	0xb5
-	.byte	0x4a
-	.byte	0xd
-	.byte	0x2d
-	.byte	0xe5
-	.byte	0x7a
-	.byte	0x9f
-	.byte	0x93
-	.byte	0xc9
-	.byte	0x9c
-	.byte	0xef
-	.byte	0xa0
-	.byte	0xe0
-	.byte	0x3b
-	.byte	0x4d
-	.byte	0xae
-	.byte	0x2a
-	.byte	0xf5
-	.byte	0xb0
-	.byte	0xc8
-	.byte	0xeb
-	.byte	0xbb
-	.byte	0x3c
-	.byte	0x83
-	.byte	0x53
-	.byte	0x99
-	.byte	0x61
-	.byte	0x17
-	.byte	0x2b
-	.byte	0x4
-	.byte	0x7e
-	.byte	0xba
-	.byte	0x77
-	.byte	0xd6
-	.byte	0x26
-	.byte	0xe1
-	.byte	0x69
-	.byte	0x14
-	.byte	0x63
-	.byte	0x55
-	.byte	0x21
-	.byte	0xc
-	.byte	0x7d
+	.byte	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+	.byte	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+	.byte	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+	.byte	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+	.byte	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+	.byte	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+	.byte	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+	.byte	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+	.byte	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+	.byte	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+	.byte	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+	.byte	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+	.byte	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+	.byte	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+	.byte	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+	.byte	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+	.byte	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+	.byte	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+	.byte	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+	.byte	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+	.byte	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+	.byte	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+	.byte	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+	.byte	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+	.byte	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+	.byte	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+	.byte	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+	.byte	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+	.byte	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+	.byte	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+	.byte	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+	.byte	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
 #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB)
 	.text
 	.align	4
@@ -6398,27 +5864,24 @@ L_AES_CBC_decrypt_end:
         * HAVE_AES_ECB */
 #endif /* HAVE_AES_DECRYPT */
 #ifdef HAVE_AESGCM
+#ifndef __APPLE__
 	.text
 	.type	L_GCM_gmult_len_r, %object
 	.size	L_GCM_gmult_len_r, 64
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte (64-bit) aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_GCM_gmult_len_r:
-	.word	0x0
-	.word	0x1c200000
-	.word	0x38400000
-	.word	0x24600000
-	.word	0x70800000
-	.word	0x6ca00000
-	.word	0x48c00000
-	.word	0x54e00000
-	.word	0xe1000000
-	.word	0xfd200000
-	.word	0xd9400000
-	.word	0xc5600000
-	.word	0x91800000
-	.word	0x8da00000
-	.word	0xa9c00000
-	.word	0xb5e00000
+	.long	0x00000000,0x1c200000,0x38400000,0x24600000
+	.long	0x70800000,0x6ca00000,0x48c00000,0x54e00000
+	.long	0xe1000000,0xfd200000,0xd9400000,0xc5600000
+	.long	0x91800000,0x8da00000,0xa9c00000,0xb5e00000
 	.text
 	.align	4
 	.globl	GCM_gmult_len
@@ -6979,12 +6442,21 @@ L_GCM_gmult_len_start_block:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 742 */
 	.size	GCM_gmult_len,.-GCM_gmult_len
+#ifndef __APPLE__
 	.text
 	.type	L_AES_Thumb2_te_gcm, %object
 	.size	L_AES_Thumb2_te_gcm, 12
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte (64-bit) aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_AES_Thumb2_te_gcm:
-	.word	L_AES_Thumb2_te_data
+	.long	L_AES_Thumb2_te_data
 	.text
 	.align	4
 	.globl	AES_GCM_encrypt
@@ -7795,6 +7267,6 @@ L_AES_GCM_encrypt_end:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
index fd89d3973da..6d332507a92 100644
--- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
@@ -41,12 +41,17 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #ifndef NO_AES
 #include <wolfssl/wolfcrypt/aes.h>
 
 #ifdef HAVE_AES_DECRYPT
-XALIGNED(16) static const word32 L_AES_Thumb2_td_data[] = {
+XALIGNED(8) static const word32 L_AES_Thumb2_td_data[] = {
     0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e,
     0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303,
     0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c,
@@ -117,7 +122,7 @@ XALIGNED(16) static const word32 L_AES_Thumb2_td_data[] = {
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
     defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
-XALIGNED(16) static const word32 L_AES_Thumb2_te_data[] = {
+XALIGNED(8) static const word32 L_AES_Thumb2_te_data[] = {
     0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
     0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
     0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b,
@@ -196,7 +201,7 @@ static const word32* L_AES_Thumb2_te = L_AES_Thumb2_te_data;
 #endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM ||
         * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
 #ifdef HAVE_AES_DECRYPT
-void AES_invert_key(unsigned char* ks, word32 rounds);
+void AES_invert_key(unsigned char* ks_p, word32 rounds_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
 #else
@@ -208,15 +213,11 @@ WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks, word32 rounds)
     register word32 rounds __asm__ ("r1") = (word32)rounds_p;
     register word32* L_AES_Thumb2_te_c __asm__ ("r2") =
         (word32*)L_AES_Thumb2_te;
-
     register word32* L_AES_Thumb2_td_c __asm__ ("r3") =
         (word32*)L_AES_Thumb2_td;
-
 #else
     register word32* L_AES_Thumb2_te_c = (word32*)L_AES_Thumb2_te;
-
     register word32* L_AES_Thumb2_td_c = (word32*)L_AES_Thumb2_td;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -325,24 +326,31 @@ WC_OMIT_FRAME_POINTER void AES_invert_key(unsigned char* ks, word32 rounds)
 #else
         "BNE.W	L_AES_invert_key_mix_loop_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ks] "+r" (ks), [rounds] "+r" (rounds),
           [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c),
           [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c)
         :
+#else
+        :
+        : [ks] "r" (ks), [rounds] "r" (rounds),
+          [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te_c),
+          [L_AES_Thumb2_td] "r" (L_AES_Thumb2_td_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11"
     );
 }
 
 #endif /* HAVE_AES_DECRYPT */
-XALIGNED(16) static const word32 L_AES_Thumb2_rcon[] = {
+XALIGNED(8) static const word32 L_AES_Thumb2_rcon[] = {
     0x01000000, 0x02000000, 0x04000000, 0x08000000,
     0x10000000, 0x20000000, 0x40000000, 0x80000000,
     0x1b000000, 0x36000000
 };
 
-void AES_set_encrypt_key(const unsigned char* key, word32 len,
-        unsigned char* ks);
+void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p,
+    unsigned char* ks_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key_p,
     word32 len_p, unsigned char* ks_p)
@@ -358,15 +366,11 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
     register unsigned char* ks __asm__ ("r2") = (unsigned char*)ks_p;
     register word32* L_AES_Thumb2_te_c __asm__ ("r3") =
         (word32*)L_AES_Thumb2_te;
-
     register word32* L_AES_Thumb2_rcon_c __asm__ ("r4") =
         (word32*)&L_AES_Thumb2_rcon;
-
 #else
     register word32* L_AES_Thumb2_te_c = (word32*)L_AES_Thumb2_te;
-
     register word32* L_AES_Thumb2_rcon_c = (word32*)&L_AES_Thumb2_rcon;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -629,16 +633,24 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
 #else
     "L_AES_set_encrypt_key_end_%=:\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks),
           [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c),
           [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c)
         :
+#else
+        :
+        : [key] "r" (key), [len] "r" (len), [ks] "r" (ks),
+          [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te_c),
+          [L_AES_Thumb2_rcon] "r" (L_AES_Thumb2_rcon_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10"
     );
 }
 
 #ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE
-void AES_encrypt_block(const word32* te, int nr, int len, const word32* ks);
+void AES_encrypt_block(const word32* te_p, int nr_p, int len_p,
+    const word32* ks_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_encrypt_block(const word32* te_p, int nr_p,
     int len_p, const word32* ks_p)
@@ -869,8 +881,13 @@ WC_OMIT_FRAME_POINTER void AES_encrypt_block(const word32* te, int nr, int len,
         "EOR	r5, r5, r9\n\t"
         "EOR	r6, r6, r10\n\t"
         "EOR	r7, r7, r11\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks)
         :
+#else
+        :
+        : [te] "r" (te), [nr] "r" (nr), [len] "r" (len), [ks] "r" (ks)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -884,8 +901,8 @@ static const word32* L_AES_Thumb2_te_ecb = L_AES_Thumb2_te_data;
 #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
     defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
     defined(HAVE_AES_ECB)
-void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
-        unsigned long len, const unsigned char* ks, int nr);
+void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p,
+    unsigned long len_p, const unsigned char* ks_p, int nr_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in_p,
     unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p,
@@ -905,10 +922,8 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
     register int nr __asm__ ("r4") = (int)nr_p;
     register word32* L_AES_Thumb2_te_ecb_c __asm__ ("r5") =
         (word32*)L_AES_Thumb2_te_ecb;
-
 #else
     register word32* L_AES_Thumb2_te_ecb_c = (word32*)L_AES_Thumb2_te_ecb;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1749,9 +1764,15 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
     "L_AES_ECB_encrypt_end_%=:\n\t"
 #endif
         "POP	{%[ks]}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
           [nr] "+r" (nr), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c)
         :
+#else
+        :
+        : [in] "r" (in), [out] "r" (out), [len] "r" (len), [ks] "r" (ks),
+          [nr] "r" (nr), [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11"
     );
 }
@@ -1759,8 +1780,9 @@ WC_OMIT_FRAME_POINTER void AES_ECB_encrypt(const unsigned char* in,
 #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT ||
         * WOLFSSL_AES_COUNTER || HAVE_AES_ECB */
 #ifdef HAVE_AES_CBC
-void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
-        unsigned long len, const unsigned char* ks, int nr, unsigned char* iv);
+void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p,
+    unsigned long len_p, const unsigned char* ks_p, int nr_p,
+    unsigned char* iv_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in_p,
     unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p,
@@ -1782,10 +1804,8 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
     register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p;
     register word32* L_AES_Thumb2_te_ecb_c __asm__ ("r6") =
         (word32*)L_AES_Thumb2_te_ecb;
-
 #else
     register word32* L_AES_Thumb2_te_ecb_c = (word32*)L_AES_Thumb2_te_ecb;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2645,10 +2665,17 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
 #endif
         "POP	{%[ks], r9}\n\t"
         "STM	r9, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
           [nr] "+r" (nr), [iv] "+r" (iv),
           [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c)
         :
+#else
+        :
+        : [in] "r" (in), [out] "r" (out), [len] "r" (len), [ks] "r" (ks),
+          [nr] "r" (nr), [iv] "r" (iv),
+          [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
 }
@@ -2656,8 +2683,9 @@ WC_OMIT_FRAME_POINTER void AES_CBC_encrypt(const unsigned char* in,
 #endif /* HAVE_AES_CBC */
 #ifdef WOLFSSL_AES_COUNTER
 static const word32* L_AES_Thumb2_te_ctr = L_AES_Thumb2_te_data;
-void AES_CTR_encrypt(const unsigned char* in, unsigned char* out,
-        unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr);
+void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p,
+    unsigned long len_p, const unsigned char* ks_p, int nr_p,
+    unsigned char* ctr_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in_p,
     unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p,
@@ -2679,10 +2707,8 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
     register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p;
     register word32* L_AES_Thumb2_te_ctr_c __asm__ ("r6") =
         (word32*)L_AES_Thumb2_te_ctr;
-
 #else
     register word32* L_AES_Thumb2_te_ctr_c = (word32*)L_AES_Thumb2_te_ctr;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3563,10 +3589,17 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
         "REV	r6, r6\n\t"
         "REV	r7, r7\n\t"
         "STM	r8, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
           [nr] "+r" (nr), [ctr] "+r" (ctr),
           [L_AES_Thumb2_te_ctr] "+r" (L_AES_Thumb2_te_ctr_c)
         :
+#else
+        :
+        : [in] "r" (in), [out] "r" (out), [len] "r" (len), [ks] "r" (ks),
+          [nr] "r" (nr), [ctr] "r" (ctr),
+          [L_AES_Thumb2_te_ctr] "r" (L_AES_Thumb2_te_ctr_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
 }
@@ -3576,7 +3609,7 @@ WC_OMIT_FRAME_POINTER void AES_CTR_encrypt(const unsigned char* in,
 #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
     defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB)
 #ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE
-void AES_decrypt_block(const word32* td, int nr, const byte* td4);
+void AES_decrypt_block(const word32* td_p, int nr_p, const byte* td4_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_decrypt_block(const word32* td_p, int nr_p,
     const byte* td4_p)
@@ -3806,15 +3839,20 @@ WC_OMIT_FRAME_POINTER void AES_decrypt_block(const word32* td, int nr,
         "EOR	r5, r5, r9\n\t"
         "EOR	r6, r6, r10\n\t"
         "EOR	r7, r7, r11\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4)
         :
+#else
+        :
+        : [td] "r" (td), [nr] "r" (nr), [td4] "r" (td4)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
 
 #endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */
 static const word32* L_AES_Thumb2_td_ecb = L_AES_Thumb2_td_data;
-static const byte L_AES_Thumb2_td4[] = {
+XALIGNED(4) static const word8 L_AES_Thumb2_td4[] = {
     0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
     0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
     0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
@@ -3849,9 +3887,10 @@ static const byte L_AES_Thumb2_td4[] = {
     0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
 };
 
-#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB)
-void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
-        unsigned long len, const unsigned char* ks, int nr);
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(HAVE_AES_ECB)
+void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p,
+    unsigned long len_p, const unsigned char* ks_p, int nr_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in_p,
     unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p,
@@ -3871,15 +3910,11 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
     register int nr __asm__ ("r4") = (int)nr_p;
     register word32* L_AES_Thumb2_td_ecb_c __asm__ ("r5") =
         (word32*)L_AES_Thumb2_td_ecb;
-
-    register byte* L_AES_Thumb2_td4_c __asm__ ("r6") =
-        (byte*)&L_AES_Thumb2_td4;
-
+    register word8* L_AES_Thumb2_td4_c __asm__ ("r6") =
+        (word8*)&L_AES_Thumb2_td4;
 #else
     register word32* L_AES_Thumb2_td_ecb_c = (word32*)L_AES_Thumb2_td_ecb;
-
-    register byte* L_AES_Thumb2_td4_c = (byte*)&L_AES_Thumb2_td4;
-
+    register word8* L_AES_Thumb2_td4_c = (word8*)&L_AES_Thumb2_td4;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4717,18 +4752,26 @@ WC_OMIT_FRAME_POINTER void AES_ECB_decrypt(const unsigned char* in,
 #else
     "L_AES_ECB_decrypt_end_%=:\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
           [nr] "+r" (nr), [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c),
           [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c)
         :
+#else
+        :
+        : [in] "r" (in), [out] "r" (out), [len] "r" (len), [ks] "r" (ks),
+          [nr] "r" (nr), [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb_c),
+          [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
 }
 
 #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || defined(HAVE_AES_ECB) */
 #ifdef HAVE_AES_CBC
-void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
-        unsigned long len, const unsigned char* ks, int nr, unsigned char* iv);
+void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p,
+    unsigned long len_p, const unsigned char* ks_p, int nr_p,
+    unsigned char* iv_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in_p,
     unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p,
@@ -4750,15 +4793,11 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
     register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p;
     register word32* L_AES_Thumb2_td_ecb_c __asm__ ("r6") =
         (word32*)L_AES_Thumb2_td_ecb;
-
-    register byte* L_AES_Thumb2_td4_c __asm__ ("r7") =
-        (byte*)&L_AES_Thumb2_td4;
-
+    register word8* L_AES_Thumb2_td4_c __asm__ ("r7") =
+        (word8*)&L_AES_Thumb2_td4;
 #else
     register word32* L_AES_Thumb2_td_ecb_c = (word32*)L_AES_Thumb2_td_ecb;
-
-    register byte* L_AES_Thumb2_td4_c = (byte*)&L_AES_Thumb2_td4;
-
+    register word8* L_AES_Thumb2_td4_c = (word8*)&L_AES_Thumb2_td4;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -6431,11 +6470,19 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
     "L_AES_CBC_decrypt_end_%=:\n\t"
 #endif
         "POP	{%[ks], r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
           [nr] "+r" (nr), [iv] "+r" (iv),
           [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c),
           [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c)
         :
+#else
+        :
+        : [in] "r" (in), [out] "r" (out), [len] "r" (len), [ks] "r" (ks),
+          [nr] "r" (nr), [iv] "r" (iv),
+          [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb_c),
+          [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r8", "r9", "r10", "r11"
     );
 }
@@ -6445,15 +6492,15 @@ WC_OMIT_FRAME_POINTER void AES_CBC_decrypt(const unsigned char* in,
         * HAVE_AES_ECB */
 #endif /* HAVE_AES_DECRYPT */
 #ifdef HAVE_AESGCM
-XALIGNED(16) static const word32 L_GCM_gmult_len_r[] = {
+XALIGNED(8) static const word32 L_GCM_gmult_len_r[] = {
     0x00000000, 0x1c200000, 0x38400000, 0x24600000,
     0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000,
     0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000,
     0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000,
 };
 
-void GCM_gmult_len(unsigned char* x, const unsigned char** m,
-        const unsigned char* data, unsigned long len);
+void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p,
+    const unsigned char* data_p, unsigned long len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void GCM_gmult_len(unsigned char* x_p,
     const unsigned char** m_p, const unsigned char* data_p, unsigned long len_p)
@@ -6471,10 +6518,8 @@ WC_OMIT_FRAME_POINTER void GCM_gmult_len(unsigned char* x,
     register unsigned long len __asm__ ("r3") = (unsigned long)len_p;
     register word32* L_GCM_gmult_len_r_c __asm__ ("r4") =
         (word32*)&L_GCM_gmult_len_r;
-
 #else
     register word32* L_GCM_gmult_len_r_c = (word32*)&L_GCM_gmult_len_r;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -7036,17 +7081,24 @@ WC_OMIT_FRAME_POINTER void GCM_gmult_len(unsigned char* x,
 #else
         "BNE.W	L_GCM_gmult_len_start_block_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len),
           [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c)
         :
+#else
+        :
+        : [x] "r" (x), [m] "r" (m), [data] "r" (data), [len] "r" (len),
+          [L_GCM_gmult_len_r] "r" (L_GCM_gmult_len_r_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
 }
 
 static const word32* L_AES_Thumb2_te_gcm = L_AES_Thumb2_te_data;
-void AES_GCM_encrypt(const unsigned char* in, unsigned char* out,
-        unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr);
+void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p,
+    unsigned long len_p, const unsigned char* ks_p, int nr_p,
+    unsigned char* ctr_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in_p,
     unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p,
@@ -7068,10 +7120,8 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
     register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p;
     register word32* L_AES_Thumb2_te_gcm_c __asm__ ("r6") =
         (word32*)L_AES_Thumb2_te_gcm;
-
 #else
     register word32* L_AES_Thumb2_te_gcm_c = (word32*)L_AES_Thumb2_te_gcm;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -7943,10 +7993,17 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in,
         "REV	r6, r6\n\t"
         "REV	r7, r7\n\t"
         "STM	r8, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
           [nr] "+r" (nr), [ctr] "+r" (ctr),
           [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c)
         :
+#else
+        :
+        : [in] "r" (in), [out] "r" (out), [len] "r" (len), [ks] "r" (ks),
+          [nr] "r" (nr), [ctr] "r" (ctr),
+          [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
 }
diff --git a/wolfcrypt/src/port/arm/thumb2-chacha-asm.S b/wolfcrypt/src/port/arm/thumb2-chacha-asm.S
index be046d02bb2..775c3f51483 100644
--- a/wolfcrypt/src/port/arm/thumb2-chacha-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-chacha-asm.S
@@ -53,19 +53,22 @@ wc_chacha_setiv:
 	POP	{r4, r5, r6, pc}
 	/* Cycle Count = 26 */
 	.size	wc_chacha_setiv,.-wc_chacha_setiv
+#ifndef __APPLE__
 	.text
 	.type	L_chacha_thumb2_constants, %object
 	.size	L_chacha_thumb2_constants, 32
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte (64-bit) aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_chacha_thumb2_constants:
-	.word	0x61707865
-	.word	0x3120646e
-	.word	0x79622d36
-	.word	0x6b206574
-	.word	0x61707865
-	.word	0x3320646e
-	.word	0x79622d32
-	.word	0x6b206574
+	.long	0x61707865,0x3120646e,0x79622d36,0x6b206574
+	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574
 	.text
 	.align	4
 	.globl	wc_chacha_setkey
@@ -568,6 +571,6 @@ L_chacha_thumb2_over_done:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c b/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c
index 4d2627524bc..cfaf6fa2ddb 100644
--- a/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c
@@ -41,6 +41,11 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #ifdef HAVE_CHACHA
 #include <wolfssl/wolfcrypt/chacha.h>
@@ -71,13 +76,18 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setiv(word32* x, const byte* iv,
         "REV	r6, r6\n\t"
 #endif /* BIG_ENDIAN_ORDER */
         "STM	r3, {r4, r5, r6}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter)
         :
+#else
+        :
+        : [x] "r" (x), [iv] "r" (iv), [counter] "r" (counter)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6"
     );
 }
 
-XALIGNED(16) static const word32 L_chacha_thumb2_constants[] = {
+XALIGNED(8) static const word32 L_chacha_thumb2_constants[] = {
     0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
     0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
 };
@@ -96,11 +106,9 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x, const byte* key,
     register word32 keySz __asm__ ("r2") = (word32)keySz_p;
     register word32* L_chacha_thumb2_constants_c __asm__ ("r3") =
         (word32*)&L_chacha_thumb2_constants;
-
 #else
     register word32* L_chacha_thumb2_constants_c =
         (word32*)&L_chacha_thumb2_constants;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143,9 +151,15 @@ WC_OMIT_FRAME_POINTER void wc_chacha_setkey(word32* x, const byte* key,
     "L_chacha_thumb2_setkey_same_key_bytes_%=:\n\t"
 #endif
         "STM	%[x], {r3, r4, r5, r6}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
           [L_chacha_thumb2_constants] "+r" (L_chacha_thumb2_constants_c)
         :
+#else
+        :
+        : [x] "r" (x), [key] "r" (key), [keySz] "r" (keySz),
+          [L_chacha_thumb2_constants] "r" (L_chacha_thumb2_constants_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7"
     );
 }
@@ -583,8 +597,13 @@ WC_OMIT_FRAME_POINTER void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c,
     "L_chacha_thumb2_crypt_done_%=:\n\t"
 #endif
         "ADD	sp, sp, #0x34\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len)
         :
+#else
+        :
+        : [ctx] "r" (ctx), [c] "r" (c), [m] "r" (m), [len] "r" (len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
@@ -727,9 +746,15 @@ WC_OMIT_FRAME_POINTER void wc_chacha_use_over(byte* over, byte* output,
 #else
     "L_chacha_thumb2_over_done_%=:\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
           [len] "+r" (len)
         :
+#else
+        :
+        : [over] "r" (over), [output] "r" (output), [input] "r" (input),
+          [len] "r" (len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
     );
 }
diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S
index bbacc310e47..27acee8b1cb 100644
--- a/wolfcrypt/src/port/arm/thumb2-curve25519.S
+++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S
@@ -6480,6 +6480,6 @@ sc_muladd:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
index 23816a095a7..e351b4349e2 100644
--- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c
@@ -41,6 +41,11 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 /* Based on work by: Emil Lenngren
  * https://github.com/pornin/X25519-Cortex-M4
@@ -63,8 +68,13 @@ WC_OMIT_FRAME_POINTER void fe_init()
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
     __asm__ __volatile__ (
         "\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+        :
+        :
+#else
         :
         :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc"
     );
 }
@@ -166,8 +176,13 @@ WC_OMIT_FRAME_POINTER void fe_add_sub_op()
         "SBC	r11, r11, #0x0\n\t"
         "STM	r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         /* Done Add-Sub */
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+        :
+        :
+#else
         :
         :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -207,8 +222,13 @@ WC_OMIT_FRAME_POINTER void fe_sub_op()
         "SBC	lr, lr, #0x0\n\t"
         "STM	r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t"
         /* Done Sub */
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+        :
+        :
+#else
         :
         :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -227,8 +247,13 @@ WC_OMIT_FRAME_POINTER void fe_sub(fe r, const fe a, const fe b)
 
     __asm__ __volatile__ (
         "BL	fe_sub_op\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -269,8 +294,13 @@ WC_OMIT_FRAME_POINTER void fe_add_op()
         "ADC	lr, lr, #0x0\n\t"
         "STM	r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t"
         /* Done Add */
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         :
         :
+#else
+        :
+        :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -289,8 +319,13 @@ WC_OMIT_FRAME_POINTER void fe_add(fe r, const fe a, const fe b)
 
     __asm__ __volatile__ (
         "BL	fe_add_op\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -327,8 +362,13 @@ WC_OMIT_FRAME_POINTER void fe_frombytes(fe out, const unsigned char* in)
         "STR	r7, [%[out], #20]\n\t"
         "STR	r8, [%[out], #24]\n\t"
         "STR	r9, [%[out], #28]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [out] "+r" (out), [in] "+r" (in)
         :
+#else
+        :
+        : [out] "r" (out), [in] "r" (in)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -373,8 +413,13 @@ WC_OMIT_FRAME_POINTER void fe_tobytes(unsigned char* out, const fe n)
         "STR	r7, [%[out], #20]\n\t"
         "STR	r8, [%[out], #24]\n\t"
         "STR	r9, [%[out], #28]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [out] "+r" (out), [n] "+r" (n)
         :
+#else
+        :
+        : [out] "r" (out), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
 }
@@ -400,8 +445,13 @@ WC_OMIT_FRAME_POINTER void fe_1(fe n)
         "MOV	r8, #0x0\n\t"
         "MOV	r9, #0x0\n\t"
         "STM	%[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [n] "+r" (n)
         :
+#else
+        :
+        : [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -427,8 +477,13 @@ WC_OMIT_FRAME_POINTER void fe_0(fe n)
         "MOV	r8, #0x0\n\t"
         "MOV	r9, #0x0\n\t"
         "STM	%[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [n] "+r" (n)
         :
+#else
+        :
+        : [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -454,8 +509,13 @@ WC_OMIT_FRAME_POINTER void fe_copy(fe r, const fe a)
         "LDRD	r4, r5, [%[a], #24]\n\t"
         "STRD	r2, r3, [%[r], #16]\n\t"
         "STRD	r4, r5, [%[r], #24]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5"
     );
 }
@@ -487,8 +547,13 @@ WC_OMIT_FRAME_POINTER void fe_neg(fe r, const fe a)
         "SBCS	r4, r7, r4\n\t"
         "SBC	r5, r6, r5\n\t"
         "STM	%[r]!, {r2, r3, r4, r5}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
 }
@@ -531,8 +596,13 @@ WC_OMIT_FRAME_POINTER int fe_isnonzero(const fe a)
         "ORR	r4, r4, r6\n\t"
         "ORR	r2, r2, r8\n\t"
         "ORR	%[a], r2, r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10"
     );
@@ -564,14 +634,20 @@ WC_OMIT_FRAME_POINTER int fe_isnegative(const fe a)
         "AND	%[a], r2, #0x1\n\t"
         "LSR	r1, r1, #31\n\t"
         "EOR	%[a], %[a], r1\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
 
-#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) || defined(WOLFSSL_CURVE25519_USE_ED25519)
+#if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) || \
+        defined(WOLFSSL_CURVE25519_USE_ED25519)
 #ifndef WC_NO_CACHE_RESISTANT
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r_p, const fe* base_p,
@@ -1550,8 +1626,13 @@ WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r, const fe* base, signed char b)
         "STRD	r4, r5, [%[r], #24]\n\t"
         "STRD	r6, r7, [%[r], #56]\n\t"
         "STRD	r8, r9, [%[r], #88]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [base] "r" (base), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10",
             "r11", "r12", "lr"
     );
@@ -1664,15 +1745,21 @@ WC_OMIT_FRAME_POINTER void fe_cmov_table(fe* r, const fe* base, signed char b)
         "AND	r7, r7, lr\n\t"
         "STM	%[r]!, {r4, r5, r6, r7}\n\t"
         "SUB	%[base], %[base], %[b]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [base] "r" (base), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
 }
 
 #endif /* WC_NO_CACHE_RESISTANT */
-#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN || WOLFSSL_CURVE25519_USE_ED25519 */
+#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN ||
+        * WOLFSSL_CURVE25519_USE_ED25519 */
 #endif /* HAVE_ED25519 || WOLFSSL_CURVE25519_USE_ED25519 */
 #ifdef WOLFSSL_ARM_ARCH_7M
 void fe_mul_op(void);
@@ -2054,8 +2141,13 @@ WC_OMIT_FRAME_POINTER void fe_mul_op()
         "LDR	r0, [sp, #36]\n\t"
         "STM	r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x28\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         :
         :
+#else
+        :
+        :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -2194,8 +2286,13 @@ WC_OMIT_FRAME_POINTER void fe_mul_op()
         /* Store */
         "STM	lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
         "ADD	sp, sp, #0x10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         :
         :
+#else
+        :
+        :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -2215,8 +2312,13 @@ WC_OMIT_FRAME_POINTER void fe_mul(fe r, const fe a, const fe b)
 
     __asm__ __volatile__ (
         "BL	fe_mul_op\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -2495,8 +2597,13 @@ WC_OMIT_FRAME_POINTER void fe_sq_op()
         "LDR	r0, [sp, #64]\n\t"
         "STM	r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+        :
+        :
+#else
         :
         :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -2621,8 +2728,13 @@ WC_OMIT_FRAME_POINTER void fe_sq_op()
         "POP	{lr}\n\t"
         /* Store */
         "STM	lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+        :
+        :
+#else
         :
         :
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -2641,8 +2753,13 @@ WC_OMIT_FRAME_POINTER void fe_sq(fe r, const fe a)
 
     __asm__ __volatile__ (
         "BL	fe_sq_op\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -2702,8 +2819,13 @@ WC_OMIT_FRAME_POINTER void fe_mul121666(fe r, fe a)
         "ADCS	r8, r8, #0x0\n\t"
         "ADC	r9, r9, #0x0\n\t"
         "STM	%[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -2749,8 +2871,13 @@ WC_OMIT_FRAME_POINTER void fe_mul121666(fe r, fe a)
         "ADCS	r8, r8, #0x0\n\t"
         "ADC	r9, r9, #0x0\n\t"
         "STM	%[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -3247,8 +3374,13 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "BL	fe_mul_op\n\t"
         "MOV	r0, #0x0\n\t"
         "ADD	sp, sp, #0xbc\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [n] "r" (n), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12", "lr"
     );
@@ -3655,8 +3787,13 @@ WC_OMIT_FRAME_POINTER int curve25519(byte* r, const byte* n, const byte* a)
         "STM	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         "MOV	r0, #0x0\n\t"
         "ADD	sp, sp, #0xc0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [n] "r" (n), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12", "lr"
     );
@@ -3916,8 +4053,13 @@ WC_OMIT_FRAME_POINTER void fe_invert(fe r, const fe a)
         "LDR	%[a], [sp, #132]\n\t"
         "LDR	%[r], [sp, #128]\n\t"
         "ADD	sp, sp, #0x88\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
     );
@@ -4231,8 +4373,13 @@ WC_OMIT_FRAME_POINTER void fe_sq2(fe r, const fe a)
         "LDR	r0, [sp, #64]\n\t"
         "STM	r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -4394,8 +4541,13 @@ WC_OMIT_FRAME_POINTER void fe_sq2(fe r, const fe a)
         "STM	r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
         "MOV	r0, r12\n\t"
         "MOV	r1, lr\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr"
     );
 }
@@ -4651,8 +4803,13 @@ WC_OMIT_FRAME_POINTER void fe_pow22523(fe r, const fe a)
         "LDR	%[a], [sp, #100]\n\t"
         "LDR	%[r], [sp, #96]\n\t"
         "ADD	sp, sp, #0x68\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
     );
@@ -4688,8 +4845,13 @@ WC_OMIT_FRAME_POINTER void ge_p1p1_to_p2(ge_p2 * r, const ge_p1p1 * p)
         "ADD	r0, r0, #0x40\n\t"
         "BL	fe_mul_op\n\t"
         "ADD	sp, sp, #0x8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11", "r12"
     );
@@ -4730,8 +4892,13 @@ WC_OMIT_FRAME_POINTER void ge_p1p1_to_p3(ge_p3 * r, const ge_p1p1 * p)
         "ADD	r0, r0, #0x60\n\t"
         "BL	fe_mul_op\n\t"
         "ADD	sp, sp, #0x8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "lr", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11", "r12"
     );
@@ -4784,8 +4951,13 @@ WC_OMIT_FRAME_POINTER void ge_p2_dbl(ge_p1p1 * r, const ge_p2 * p)
         "MOV	r1, r0\n\t"
         "BL	fe_sub_op\n\t"
         "ADD	sp, sp, #0x8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -4875,8 +5047,13 @@ WC_OMIT_FRAME_POINTER void ge_madd(ge_p1p1 * r, const ge_p3 * p,
         "ADD	r1, r0, #0x20\n\t"
         "BL	fe_add_sub_op\n\t"
         "ADD	sp, sp, #0xc\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p), [q] "r" (q)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -4967,8 +5144,13 @@ WC_OMIT_FRAME_POINTER void ge_msub(ge_p1p1 * r, const ge_p3 * p,
         "ADD	r0, r0, #0x20\n\t"
         "BL	fe_add_sub_op\n\t"
         "ADD	sp, sp, #0xc\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p), [q] "r" (q)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -5059,8 +5241,13 @@ WC_OMIT_FRAME_POINTER void ge_add(ge_p1p1 * r, const ge_p3 * p,
         "ADD	r0, r0, #0x20\n\t"
         "BL	fe_add_sub_op\n\t"
         "ADD	sp, sp, #0x2c\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p), [q] "r" (q)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -5151,8 +5338,13 @@ WC_OMIT_FRAME_POINTER void ge_sub(ge_p1p1 * r, const ge_p3 * p,
         "ADD	r0, r0, #0x40\n\t"
         "BL	fe_add_sub_op\n\t"
         "ADD	sp, sp, #0x2c\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
+#else
+        :
+        : [r] "r" (r), [p] "r" (p), [q] "r" (q)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -5588,8 +5780,13 @@ WC_OMIT_FRAME_POINTER void sc_reduce(byte* s)
         "LDR	%[s], [sp, #52]\n\t"
         "STM	%[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         "ADD	sp, sp, #0x38\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [s] "+r" (s)
         :
+#else
+        :
+        : [s] "r" (s)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11", "r12", "lr"
     );
@@ -5894,8 +6091,13 @@ WC_OMIT_FRAME_POINTER void sc_reduce(byte* s)
         "LDR	%[s], [sp, #52]\n\t"
         "STM	%[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         "ADD	sp, sp, #0x38\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [s] "+r" (s)
         :
+#else
+        :
+        : [s] "r" (s)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11", "r12", "lr"
     );
@@ -6693,8 +6895,13 @@ WC_OMIT_FRAME_POINTER void sc_muladd(byte* s, const byte* a, const byte* b,
         "STR	r8, [%[s], #24]\n\t"
         "STR	r9, [%[s], #28]\n\t"
         "ADD	sp, sp, #0x50\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
         :
+#else
+        :
+        : [s] "r" (s), [a] "r" (a), [b] "r" (b), [c] "r" (c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
@@ -7134,8 +7341,13 @@ WC_OMIT_FRAME_POINTER void sc_muladd(byte* s, const byte* a, const byte* b,
         "STR	r8, [%[s], #24]\n\t"
         "STR	r9, [%[s], #28]\n\t"
         "ADD	sp, sp, #0x50\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
         :
+#else
+        :
+        : [s] "r" (s), [a] "r" (a), [b] "r" (b), [c] "r" (c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
diff --git a/wolfcrypt/src/port/arm/thumb2-mlkem-asm.S b/wolfcrypt/src/port/arm/thumb2-mlkem-asm.S
index 5dd14176559..42cd8622f69 100644
--- a/wolfcrypt/src/port/arm/thumb2-mlkem-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-mlkem-asm.S
@@ -33,139 +33,36 @@
 	.thumb
 	.syntax unified
 #ifdef WOLFSSL_WC_MLKEM
+#ifndef __APPLE__
 	.text
 	.type	L_mlkem_thumb2_ntt_zetas, %object
 	.size	L_mlkem_thumb2_ntt_zetas, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* Align to 4 bytes (32 bits); the operand below is log2 of the size */
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_mlkem_thumb2_ntt_zetas:
-	.short	0x8ed
-	.short	0xa0b
-	.short	0xb9a
-	.short	0x714
-	.short	0x5d5
-	.short	0x58e
-	.short	0x11f
-	.short	0xca
-	.short	0xc56
-	.short	0x26e
-	.short	0x629
-	.short	0xb6
-	.short	0x3c2
-	.short	0x84f
-	.short	0x73f
-	.short	0x5bc
-	.short	0x23d
-	.short	0x7d4
-	.short	0x108
-	.short	0x17f
-	.short	0x9c4
-	.short	0x5b2
-	.short	0x6bf
-	.short	0xc7f
-	.short	0xa58
-	.short	0x3f9
-	.short	0x2dc
-	.short	0x260
-	.short	0x6fb
-	.short	0x19b
-	.short	0xc34
-	.short	0x6de
-	.short	0x4c7
-	.short	0x28c
-	.short	0xad9
-	.short	0x3f7
-	.short	0x7f4
-	.short	0x5d3
-	.short	0xbe7
-	.short	0x6f9
-	.short	0x204
-	.short	0xcf9
-	.short	0xbc1
-	.short	0xa67
-	.short	0x6af
-	.short	0x877
-	.short	0x7e
-	.short	0x5bd
-	.short	0x9ac
-	.short	0xca7
-	.short	0xbf2
-	.short	0x33e
-	.short	0x6b
-	.short	0x774
-	.short	0xc0a
-	.short	0x94a
-	.short	0xb73
-	.short	0x3c1
-	.short	0x71d
-	.short	0xa2c
-	.short	0x1c0
-	.short	0x8d8
-	.short	0x2a5
-	.short	0x806
-	.short	0x8b2
-	.short	0x1ae
-	.short	0x22b
-	.short	0x34b
-	.short	0x81e
-	.short	0x367
-	.short	0x60e
-	.short	0x69
-	.short	0x1a6
-	.short	0x24b
-	.short	0xb1
-	.short	0xc16
-	.short	0xbde
-	.short	0xb35
-	.short	0x626
-	.short	0x675
-	.short	0xc0b
-	.short	0x30a
-	.short	0x487
-	.short	0xc6e
-	.short	0x9f8
-	.short	0x5cb
-	.short	0xaa7
-	.short	0x45f
-	.short	0x6cb
-	.short	0x284
-	.short	0x999
-	.short	0x15d
-	.short	0x1a2
-	.short	0x149
-	.short	0xc65
-	.short	0xcb6
-	.short	0x331
-	.short	0x449
-	.short	0x25b
-	.short	0x262
-	.short	0x52a
-	.short	0x7fc
-	.short	0x748
-	.short	0x180
-	.short	0x842
-	.short	0xc79
-	.short	0x4c2
-	.short	0x7ca
-	.short	0x997
-	.short	0xdc
-	.short	0x85e
-	.short	0x686
-	.short	0x860
-	.short	0x707
-	.short	0x803
-	.short	0x31a
-	.short	0x71b
-	.short	0x9ab
-	.short	0x99b
-	.short	0x1de
-	.short	0xc95
-	.short	0xbcd
-	.short	0x3e4
-	.short	0x3df
-	.short	0x3be
-	.short	0x74d
-	.short	0x5f2
-	.short	0x65c
+	.short	0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
+	.short	0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc
+	.short	0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f
+	.short	0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de
+	.short	0x04c7,0x028c,0x0ad9,0x03f7,0x07f4,0x05d3,0x0be7,0x06f9
+	.short	0x0204,0x0cf9,0x0bc1,0x0a67,0x06af,0x0877,0x007e,0x05bd
+	.short	0x09ac,0x0ca7,0x0bf2,0x033e,0x006b,0x0774,0x0c0a,0x094a
+	.short	0x0b73,0x03c1,0x071d,0x0a2c,0x01c0,0x08d8,0x02a5,0x0806
+	.short	0x08b2,0x01ae,0x022b,0x034b,0x081e,0x0367,0x060e,0x0069
+	.short	0x01a6,0x024b,0x00b1,0x0c16,0x0bde,0x0b35,0x0626,0x0675
+	.short	0x0c0b,0x030a,0x0487,0x0c6e,0x09f8,0x05cb,0x0aa7,0x045f
+	.short	0x06cb,0x0284,0x0999,0x015d,0x01a2,0x0149,0x0c65,0x0cb6
+	.short	0x0331,0x0449,0x025b,0x0262,0x052a,0x07fc,0x0748,0x0180
+	.short	0x0842,0x0c79,0x04c2,0x07ca,0x0997,0x00dc,0x085e,0x0686
+	.short	0x0860,0x0707,0x0803,0x031a,0x071b,0x09ab,0x099b,0x01de
+	.short	0x0c95,0x0bcd,0x03e4,0x03df,0x03be,0x074d,0x05f2,0x065c
 	.text
 	.align	4
 	.globl	mlkem_thumb2_ntt
@@ -1425,139 +1322,36 @@ L_mlkem_thumb2_ntt_loop_567:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 1270 */
 	.size	mlkem_thumb2_ntt,.-mlkem_thumb2_ntt
+#ifndef __APPLE__
 	.text
 	.type	L_mlkem_invntt_zetas_inv, %object
 	.size	L_mlkem_invntt_zetas_inv, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* Align to 4 bytes (32 bits); the operand below is log2 of the size */
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_mlkem_invntt_zetas_inv:
-	.short	0x6a5
-	.short	0x70f
-	.short	0x5b4
-	.short	0x943
-	.short	0x922
-	.short	0x91d
-	.short	0x134
-	.short	0x6c
-	.short	0xb23
-	.short	0x366
-	.short	0x356
-	.short	0x5e6
-	.short	0x9e7
-	.short	0x4fe
-	.short	0x5fa
-	.short	0x4a1
-	.short	0x67b
-	.short	0x4a3
-	.short	0xc25
-	.short	0x36a
-	.short	0x537
-	.short	0x83f
-	.short	0x88
-	.short	0x4bf
-	.short	0xb81
-	.short	0x5b9
-	.short	0x505
-	.short	0x7d7
-	.short	0xa9f
-	.short	0xaa6
-	.short	0x8b8
-	.short	0x9d0
-	.short	0x4b
-	.short	0x9c
-	.short	0xbb8
-	.short	0xb5f
-	.short	0xba4
-	.short	0x368
-	.short	0xa7d
-	.short	0x636
-	.short	0x8a2
-	.short	0x25a
-	.short	0x736
-	.short	0x309
-	.short	0x93
-	.short	0x87a
-	.short	0x9f7
-	.short	0xf6
-	.short	0x68c
-	.short	0x6db
-	.short	0x1cc
-	.short	0x123
-	.short	0xeb
-	.short	0xc50
-	.short	0xab6
-	.short	0xb5b
-	.short	0xc98
-	.short	0x6f3
-	.short	0x99a
-	.short	0x4e3
-	.short	0x9b6
-	.short	0xad6
-	.short	0xb53
-	.short	0x44f
-	.short	0x4fb
-	.short	0xa5c
-	.short	0x429
-	.short	0xb41
-	.short	0x2d5
-	.short	0x5e4
-	.short	0x940
-	.short	0x18e
-	.short	0x3b7
-	.short	0xf7
-	.short	0x58d
-	.short	0xc96
-	.short	0x9c3
-	.short	0x10f
-	.short	0x5a
-	.short	0x355
-	.short	0x744
-	.short	0xc83
-	.short	0x48a
-	.short	0x652
-	.short	0x29a
-	.short	0x140
-	.short	0x8
-	.short	0xafd
-	.short	0x608
-	.short	0x11a
-	.short	0x72e
-	.short	0x50d
-	.short	0x90a
-	.short	0x228
-	.short	0xa75
-	.short	0x83a
-	.short	0x623
-	.short	0xcd
-	.short	0xb66
-	.short	0x606
-	.short	0xaa1
-	.short	0xa25
-	.short	0x908
-	.short	0x2a9
-	.short	0x82
-	.short	0x642
-	.short	0x74f
-	.short	0x33d
-	.short	0xb82
-	.short	0xbf9
-	.short	0x52d
-	.short	0xac4
-	.short	0x745
-	.short	0x5c2
-	.short	0x4b2
-	.short	0x93f
-	.short	0xc4b
-	.short	0x6d8
-	.short	0xa93
-	.short	0xab
-	.short	0xc37
-	.short	0xbe2
-	.short	0x773
-	.short	0x72c
-	.short	0x5ed
-	.short	0x167
-	.short	0x2f6
-	.short	0x5a1
+	.short	0x06a5,0x070f,0x05b4,0x0943,0x0922,0x091d,0x0134,0x006c
+	.short	0x0b23,0x0366,0x0356,0x05e6,0x09e7,0x04fe,0x05fa,0x04a1
+	.short	0x067b,0x04a3,0x0c25,0x036a,0x0537,0x083f,0x0088,0x04bf
+	.short	0x0b81,0x05b9,0x0505,0x07d7,0x0a9f,0x0aa6,0x08b8,0x09d0
+	.short	0x004b,0x009c,0x0bb8,0x0b5f,0x0ba4,0x0368,0x0a7d,0x0636
+	.short	0x08a2,0x025a,0x0736,0x0309,0x0093,0x087a,0x09f7,0x00f6
+	.short	0x068c,0x06db,0x01cc,0x0123,0x00eb,0x0c50,0x0ab6,0x0b5b
+	.short	0x0c98,0x06f3,0x099a,0x04e3,0x09b6,0x0ad6,0x0b53,0x044f
+	.short	0x04fb,0x0a5c,0x0429,0x0b41,0x02d5,0x05e4,0x0940,0x018e
+	.short	0x03b7,0x00f7,0x058d,0x0c96,0x09c3,0x010f,0x005a,0x0355
+	.short	0x0744,0x0c83,0x048a,0x0652,0x029a,0x0140,0x0008,0x0afd
+	.short	0x0608,0x011a,0x072e,0x050d,0x090a,0x0228,0x0a75,0x083a
+	.short	0x0623,0x00cd,0x0b66,0x0606,0x0aa1,0x0a25,0x0908,0x02a9
+	.short	0x0082,0x0642,0x074f,0x033d,0x0b82,0x0bf9,0x052d,0x0ac4
+	.short	0x0745,0x05c2,0x04b2,0x093f,0x0c4b,0x06d8,0x0a93,0x00ab
+	.short	0x0c37,0x0be2,0x0773,0x072c,0x05ed,0x0167,0x02f6,0x05a1
 	.text
 	.align	4
 	.globl	mlkem_thumb2_invntt
@@ -3184,139 +2978,36 @@ L_mlkem_invntt_loop_321:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 1629 */
 	.size	mlkem_thumb2_invntt,.-mlkem_thumb2_invntt
+#ifndef __APPLE__
 	.text
 	.type	L_mlkem_basemul_mont_zetas, %object
 	.size	L_mlkem_basemul_mont_zetas, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* Align to 4 bytes (32 bits); the operand below is log2 of the size */
+#ifndef __APPLE__
+	.align	2
+#else
+	.p2align	2
+#endif /* __APPLE__ */
 L_mlkem_basemul_mont_zetas:
-	.short	0x8ed
-	.short	0xa0b
-	.short	0xb9a
-	.short	0x714
-	.short	0x5d5
-	.short	0x58e
-	.short	0x11f
-	.short	0xca
-	.short	0xc56
-	.short	0x26e
-	.short	0x629
-	.short	0xb6
-	.short	0x3c2
-	.short	0x84f
-	.short	0x73f
-	.short	0x5bc
-	.short	0x23d
-	.short	0x7d4
-	.short	0x108
-	.short	0x17f
-	.short	0x9c4
-	.short	0x5b2
-	.short	0x6bf
-	.short	0xc7f
-	.short	0xa58
-	.short	0x3f9
-	.short	0x2dc
-	.short	0x260
-	.short	0x6fb
-	.short	0x19b
-	.short	0xc34
-	.short	0x6de
-	.short	0x4c7
-	.short	0x28c
-	.short	0xad9
-	.short	0x3f7
-	.short	0x7f4
-	.short	0x5d3
-	.short	0xbe7
-	.short	0x6f9
-	.short	0x204
-	.short	0xcf9
-	.short	0xbc1
-	.short	0xa67
-	.short	0x6af
-	.short	0x877
-	.short	0x7e
-	.short	0x5bd
-	.short	0x9ac
-	.short	0xca7
-	.short	0xbf2
-	.short	0x33e
-	.short	0x6b
-	.short	0x774
-	.short	0xc0a
-	.short	0x94a
-	.short	0xb73
-	.short	0x3c1
-	.short	0x71d
-	.short	0xa2c
-	.short	0x1c0
-	.short	0x8d8
-	.short	0x2a5
-	.short	0x806
-	.short	0x8b2
-	.short	0x1ae
-	.short	0x22b
-	.short	0x34b
-	.short	0x81e
-	.short	0x367
-	.short	0x60e
-	.short	0x69
-	.short	0x1a6
-	.short	0x24b
-	.short	0xb1
-	.short	0xc16
-	.short	0xbde
-	.short	0xb35
-	.short	0x626
-	.short	0x675
-	.short	0xc0b
-	.short	0x30a
-	.short	0x487
-	.short	0xc6e
-	.short	0x9f8
-	.short	0x5cb
-	.short	0xaa7
-	.short	0x45f
-	.short	0x6cb
-	.short	0x284
-	.short	0x999
-	.short	0x15d
-	.short	0x1a2
-	.short	0x149
-	.short	0xc65
-	.short	0xcb6
-	.short	0x331
-	.short	0x449
-	.short	0x25b
-	.short	0x262
-	.short	0x52a
-	.short	0x7fc
-	.short	0x748
-	.short	0x180
-	.short	0x842
-	.short	0xc79
-	.short	0x4c2
-	.short	0x7ca
-	.short	0x997
-	.short	0xdc
-	.short	0x85e
-	.short	0x686
-	.short	0x860
-	.short	0x707
-	.short	0x803
-	.short	0x31a
-	.short	0x71b
-	.short	0x9ab
-	.short	0x99b
-	.short	0x1de
-	.short	0xc95
-	.short	0xbcd
-	.short	0x3e4
-	.short	0x3df
-	.short	0x3be
-	.short	0x74d
-	.short	0x5f2
-	.short	0x65c
+	.short	0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
+	.short	0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc
+	.short	0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f
+	.short	0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de
+	.short	0x04c7,0x028c,0x0ad9,0x03f7,0x07f4,0x05d3,0x0be7,0x06f9
+	.short	0x0204,0x0cf9,0x0bc1,0x0a67,0x06af,0x0877,0x007e,0x05bd
+	.short	0x09ac,0x0ca7,0x0bf2,0x033e,0x006b,0x0774,0x0c0a,0x094a
+	.short	0x0b73,0x03c1,0x071d,0x0a2c,0x01c0,0x08d8,0x02a5,0x0806
+	.short	0x08b2,0x01ae,0x022b,0x034b,0x081e,0x0367,0x060e,0x0069
+	.short	0x01a6,0x024b,0x00b1,0x0c16,0x0bde,0x0b35,0x0626,0x0675
+	.short	0x0c0b,0x030a,0x0487,0x0c6e,0x09f8,0x05cb,0x0aa7,0x045f
+	.short	0x06cb,0x0284,0x0999,0x015d,0x01a2,0x0149,0x0c65,0x0cb6
+	.short	0x0331,0x0449,0x025b,0x0262,0x052a,0x07fc,0x0748,0x0180
+	.short	0x0842,0x0c79,0x04c2,0x07ca,0x0997,0x00dc,0x085e,0x0686
+	.short	0x0860,0x0707,0x0803,0x031a,0x071b,0x09ab,0x099b,0x01de
+	.short	0x0c95,0x0bcd,0x03e4,0x03df,0x03be,0x074d,0x05f2,0x065c
 	.text
 	.align	4
 	.globl	mlkem_thumb2_basemul_mont
@@ -3896,6 +3587,6 @@ L_mlkem_thumb2_rej_uniform_done:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-mlkem-asm_c.c b/wolfcrypt/src/port/arm/thumb2-mlkem-asm_c.c
index 40a55b99238..5f45fc70518 100644
--- a/wolfcrypt/src/port/arm/thumb2-mlkem-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-mlkem-asm_c.c
@@ -41,11 +41,16 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #include <wolfssl/wolfcrypt/wc_mlkem.h>
 
 #ifdef WOLFSSL_WC_MLKEM
-XALIGNED(16) static const word16 L_mlkem_thumb2_ntt_zetas[] = {
+XALIGNED(4) static const word16 L_mlkem_thumb2_ntt_zetas[] = {
     0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca,
     0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc,
     0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f,
@@ -74,11 +79,9 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_ntt(sword16* r)
     register sword16* r __asm__ ("r0") = (sword16*)r_p;
     register word16* L_mlkem_thumb2_ntt_zetas_c __asm__ ("r1") =
         (word16*)&L_mlkem_thumb2_ntt_zetas;
-
 #else
     register word16* L_mlkem_thumb2_ntt_zetas_c =
         (word16*)&L_mlkem_thumb2_ntt_zetas;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1360,15 +1363,21 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_ntt(sword16* r)
         "BNE.N	L_mlkem_thumb2_ntt_loop_567_%=\n\t"
 #endif
         "ADD	sp, sp, #0x8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r),
           [L_mlkem_thumb2_ntt_zetas] "+r" (L_mlkem_thumb2_ntt_zetas_c)
         :
+#else
+        :
+        : [r] "r" (r),
+          [L_mlkem_thumb2_ntt_zetas] "r" (L_mlkem_thumb2_ntt_zetas_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
 }
 
-XALIGNED(16) static const word16 L_mlkem_invntt_zetas_inv[] = {
+XALIGNED(4) static const word16 L_mlkem_invntt_zetas_inv[] = {
     0x06a5, 0x070f, 0x05b4, 0x0943, 0x0922, 0x091d, 0x0134, 0x006c,
     0x0b23, 0x0366, 0x0356, 0x05e6, 0x09e7, 0x04fe, 0x05fa, 0x04a1,
     0x067b, 0x04a3, 0x0c25, 0x036a, 0x0537, 0x083f, 0x0088, 0x04bf,
@@ -1397,11 +1406,9 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_invntt(sword16* r)
     register sword16* r __asm__ ("r0") = (sword16*)r_p;
     register word16* L_mlkem_invntt_zetas_inv_c __asm__ ("r1") =
         (word16*)&L_mlkem_invntt_zetas_inv;
-
 #else
     register word16* L_mlkem_invntt_zetas_inv_c =
         (word16*)&L_mlkem_invntt_zetas_inv;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3050,15 +3057,21 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_invntt(sword16* r)
         "BNE.N	L_mlkem_invntt_loop_321_%=\n\t"
 #endif
         "ADD	sp, sp, #0x8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r),
           [L_mlkem_invntt_zetas_inv] "+r" (L_mlkem_invntt_zetas_inv_c)
         :
+#else
+        :
+        : [r] "r" (r),
+          [L_mlkem_invntt_zetas_inv] "r" (L_mlkem_invntt_zetas_inv_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
 }
 
-XALIGNED(16) static const word16 L_mlkem_basemul_mont_zetas[] = {
+XALIGNED(4) static const word16 L_mlkem_basemul_mont_zetas[] = {
     0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca,
     0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc,
     0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f,
@@ -3091,11 +3104,9 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_basemul_mont(sword16* r,
     register const sword16* b __asm__ ("r2") = (const sword16*)b_p;
     register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r3") =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3220,9 +3231,15 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_basemul_mont(sword16* r,
 #else
         "BNE.N	L_mlkem_basemul_mont_loop_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b),
           [L_mlkem_basemul_mont_zetas] "+r" (L_mlkem_basemul_mont_zetas_c)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b),
+          [L_mlkem_basemul_mont_zetas] "r" (L_mlkem_basemul_mont_zetas_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
@@ -3242,11 +3259,9 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_basemul_mont_add(sword16* r,
     register const sword16* b __asm__ ("r2") = (const sword16*)b_p;
     register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r3") =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3383,9 +3398,15 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_basemul_mont_add(sword16* r,
 #else
         "BNE.N	L_mlkem_thumb2_basemul_mont_add_loop_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b),
           [L_mlkem_basemul_mont_zetas] "+r" (L_mlkem_basemul_mont_zetas_c)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b),
+          [L_mlkem_basemul_mont_zetas] "r" (L_mlkem_basemul_mont_zetas_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
@@ -3401,11 +3422,9 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_csubq(sword16* p)
     register sword16* p __asm__ ("r0") = (sword16*)p_p;
     register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r1") =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3496,9 +3515,15 @@ WC_OMIT_FRAME_POINTER void mlkem_thumb2_csubq(sword16* p)
 #else
         "BNE.N	L_mlkem_thumb2_csubq_loop_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [p] "+r" (p),
           [L_mlkem_basemul_mont_zetas] "+r" (L_mlkem_basemul_mont_zetas_c)
         :
+#else
+        :
+        : [p] "r" (p),
+          [L_mlkem_basemul_mont_zetas] "r" (L_mlkem_basemul_mont_zetas_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -3519,11 +3544,9 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_thumb2_rej_uniform(sword16* p,
     register unsigned int rLen __asm__ ("r3") = (unsigned int)rLen_p;
     register word16* L_mlkem_basemul_mont_zetas_c __asm__ ("r4") =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #else
     register word16* L_mlkem_basemul_mont_zetas_c =
         (word16*)&L_mlkem_basemul_mont_zetas;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3847,9 +3870,15 @@ WC_OMIT_FRAME_POINTER unsigned int mlkem_thumb2_rej_uniform(sword16* p,
     "L_mlkem_thumb2_rej_uniform_done_%=:\n\t"
 #endif
         "LSR	r0, r9, #1\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen),
           [L_mlkem_basemul_mont_zetas] "+r" (L_mlkem_basemul_mont_zetas_c)
         :
+#else
+        :
+        : [p] "r" (p), [len] "r" (len), [r] "r" (r), [rLen] "r" (rLen),
+          [L_mlkem_basemul_mont_zetas] "r" (L_mlkem_basemul_mont_zetas_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)p;
diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S b/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
index 9ac9aca9d9c..2bd38c14b7e 100644
--- a/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
@@ -256,15 +256,21 @@ L_poly1305_thumb2_16_done:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 250 */
 	.size	poly1305_blocks_thumb2_16,.-poly1305_blocks_thumb2_16
+#ifndef __APPLE__
 	.text
 	.type	L_poly1305_thumb2_clamp, %object
 	.size	L_poly1305_thumb2_clamp, 16
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_poly1305_thumb2_clamp:
-	.word	0xfffffff
-	.word	0xffffffc
-	.word	0xffffffc
-	.word	0xffffffc
+	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc
 	.text
 	.align	4
 	.globl	poly1305_set_key
@@ -362,6 +368,6 @@ poly1305_final:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c b/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
index 515c955b372..c4b607a5c06 100644
--- a/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
@@ -41,6 +41,11 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #ifdef HAVE_POLY1305
 #include <wolfssl/wolfcrypt/poly1305.h>
@@ -289,15 +294,21 @@ WC_OMIT_FRAME_POINTER void poly1305_blocks_thumb2_16(Poly1305* ctx,
     "L_poly1305_thumb2_16_done_%=:\n\t"
 #endif
         "ADD	sp, sp, #0x1c\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
           [notLast] "+r" (notLast)
         :
+#else
+        :
+        : [ctx] "r" (ctx), [m] "r" (m), [len] "r" (len),
+          [notLast] "r" (notLast)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
 }
 
-XALIGNED(16) static const word32 L_poly1305_thumb2_clamp[] = {
+XALIGNED(8) static const word32 L_poly1305_thumb2_clamp[] = {
     0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
 };
 
@@ -312,11 +323,9 @@ WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx, const byte* key)
     register const byte* key __asm__ ("r1") = (const byte*)key_p;
     register word32* L_poly1305_thumb2_clamp_c __asm__ ("r2") =
         (word32*)&L_poly1305_thumb2_clamp;
-
 #else
     register word32* L_poly1305_thumb2_clamp_c =
         (word32*)&L_poly1305_thumb2_clamp;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -351,9 +360,15 @@ WC_OMIT_FRAME_POINTER void poly1305_set_key(Poly1305* ctx, const byte* key)
         "STM	r10, {r5, r6, r7, r8, r9}\n\t"
         /* Zero leftover */
         "STR	r5, [%[ctx], #52]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [key] "+r" (key),
           [L_poly1305_thumb2_clamp] "+r" (L_poly1305_thumb2_clamp_c)
         :
+#else
+        :
+        : [ctx] "r" (ctx), [key] "r" (key),
+          [L_poly1305_thumb2_clamp] "r" (L_poly1305_thumb2_clamp_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
 }
@@ -413,8 +428,13 @@ WC_OMIT_FRAME_POINTER void poly1305_final(Poly1305* ctx, byte* mac)
         /* Zero out padding. */
         "ADD	r11, %[ctx], #0x24\n\t"
         "STM	r11, {r2, r3, r4, r5}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [ctx] "+r" (ctx), [mac] "+r" (mac)
         :
+#else
+        :
+        : [ctx] "r" (ctx), [mac] "r" (mac)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S
index cb4252f3257..3daa7a93e1b 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S
@@ -34,75 +34,36 @@
 	.syntax unified
 #ifndef NO_SHA256
 #ifdef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_SHA256_transform_len_k, %object
 	.size	L_SHA256_transform_len_k, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 8-byte aligned, 64-bit aligned */
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_SHA256_transform_len_k:
-	.word	0x428a2f98
-	.word	0x71374491
-	.word	0xb5c0fbcf
-	.word	0xe9b5dba5
-	.word	0x3956c25b
-	.word	0x59f111f1
-	.word	0x923f82a4
-	.word	0xab1c5ed5
-	.word	0xd807aa98
-	.word	0x12835b01
-	.word	0x243185be
-	.word	0x550c7dc3
-	.word	0x72be5d74
-	.word	0x80deb1fe
-	.word	0x9bdc06a7
-	.word	0xc19bf174
-	.word	0xe49b69c1
-	.word	0xefbe4786
-	.word	0xfc19dc6
-	.word	0x240ca1cc
-	.word	0x2de92c6f
-	.word	0x4a7484aa
-	.word	0x5cb0a9dc
-	.word	0x76f988da
-	.word	0x983e5152
-	.word	0xa831c66d
-	.word	0xb00327c8
-	.word	0xbf597fc7
-	.word	0xc6e00bf3
-	.word	0xd5a79147
-	.word	0x6ca6351
-	.word	0x14292967
-	.word	0x27b70a85
-	.word	0x2e1b2138
-	.word	0x4d2c6dfc
-	.word	0x53380d13
-	.word	0x650a7354
-	.word	0x766a0abb
-	.word	0x81c2c92e
-	.word	0x92722c85
-	.word	0xa2bfe8a1
-	.word	0xa81a664b
-	.word	0xc24b8b70
-	.word	0xc76c51a3
-	.word	0xd192e819
-	.word	0xd6990624
-	.word	0xf40e3585
-	.word	0x106aa070
-	.word	0x19a4c116
-	.word	0x1e376c08
-	.word	0x2748774c
-	.word	0x34b0bcb5
-	.word	0x391c0cb3
-	.word	0x4ed8aa4a
-	.word	0x5b9cca4f
-	.word	0x682e6ff3
-	.word	0x748f82ee
-	.word	0x78a5636f
-	.word	0x84c87814
-	.word	0x8cc70208
-	.word	0x90befffa
-	.word	0xa4506ceb
-	.word	0xbef9a3f7
-	.word	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.text
 	.align	4
 	.globl	Transform_Sha256_Len_base
@@ -2365,6 +2326,6 @@ L_SHA256_transform_len_blk_end_15:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
index 2a5643d9a39..cb657fcc71c 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
@@ -41,12 +41,17 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #ifndef NO_SHA256
 #include <wolfssl/wolfcrypt/sha256.h>
 
 #ifdef WOLFSSL_ARMASM_NO_NEON
-XALIGNED(16) static const word32 L_SHA256_transform_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -65,7 +70,8 @@ XALIGNED(16) static const word32 L_SHA256_transform_len_k[] = {
     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 };
 
-void Transform_Sha256_Len_base(wc_Sha256* sha256, const byte* data, word32 len);
+void Transform_Sha256_Len_base(wc_Sha256* sha256_p, const byte* data_p,
+    word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256_p,
     const byte* data_p, word32 len_p)
@@ -80,11 +86,9 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
     register word32 len __asm__ ("r2") = (word32)len_p;
     register word32* L_SHA256_transform_len_k_c __asm__ ("r3") =
         (word32*)&L_SHA256_transform_len_k;
-
 #else
     register word32* L_SHA256_transform_len_k_c =
         (word32*)&L_SHA256_transform_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2468,9 +2472,15 @@ WC_OMIT_FRAME_POINTER void Transform_Sha256_Len_base(wc_Sha256* sha256,
         "BNE.W	L_SHA256_transform_len_begin_%=\n\t"
 #endif
         "ADD	sp, sp, #0xc0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len),
           [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c)
         :
+#else
+        :
+        : [sha256] "r" (sha256), [data] "r" (data), [len] "r" (len),
+          [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
diff --git a/wolfcrypt/src/port/arm/thumb2-sha3-asm.S b/wolfcrypt/src/port/arm/thumb2-sha3-asm.S
index 511883d5f11..ab4254dee9d 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha3-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-sha3-asm.S
@@ -33,59 +33,32 @@
 	.thumb
 	.syntax unified
 #ifdef WOLFSSL_SHA3
+#ifndef __APPLE__
 	.text
 	.type	L_sha3_thumb2_rt, %object
 	.size	L_sha3_thumb2_rt, 192
-	.align	8
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 16-byte aligned, 128-bit aligned */
+#ifndef __APPLE__
+	.align	4
+#else
+	.p2align	4
+#endif /* __APPLE__ */
 L_sha3_thumb2_rt:
-	.word	0x1
-	.word	0x0
-	.word	0x8082
-	.word	0x0
-	.word	0x808a
-	.word	0x80000000
-	.word	0x80008000
-	.word	0x80000000
-	.word	0x808b
-	.word	0x0
-	.word	0x80000001
-	.word	0x0
-	.word	0x80008081
-	.word	0x80000000
-	.word	0x8009
-	.word	0x80000000
-	.word	0x8a
-	.word	0x0
-	.word	0x88
-	.word	0x0
-	.word	0x80008009
-	.word	0x0
-	.word	0x8000000a
-	.word	0x0
-	.word	0x8000808b
-	.word	0x0
-	.word	0x8b
-	.word	0x80000000
-	.word	0x8089
-	.word	0x80000000
-	.word	0x8003
-	.word	0x80000000
-	.word	0x8002
-	.word	0x80000000
-	.word	0x80
-	.word	0x80000000
-	.word	0x800a
-	.word	0x0
-	.word	0x8000000a
-	.word	0x80000000
-	.word	0x80008081
-	.word	0x80000000
-	.word	0x8080
-	.word	0x80000000
-	.word	0x80000001
-	.word	0x0
-	.word	0x80008008
-	.word	0x80000000
+	.quad	0x0000000000000001,0x0000000000008082
+	.quad	0x800000000000808a,0x8000000080008000
+	.quad	0x000000000000808b,0x0000000080000001
+	.quad	0x8000000080008081,0x8000000000008009
+	.quad	0x000000000000008a,0x0000000000000088
+	.quad	0x0000000080008009,0x000000008000000a
+	.quad	0x000000008000808b,0x800000000000008b
+	.quad	0x8000000000008089,0x8000000000008003
+	.quad	0x8000000000008002,0x8000000000000080
+	.quad	0x000000000000800a,0x800000008000000a
+	.quad	0x8000000080008081,0x8000000000008080
+	.quad	0x0000000080000001,0x8000000080008008
 	.text
 	.align	4
 	.globl	BlockSha3
@@ -1169,6 +1142,6 @@ L_sha3_thumb2_begin:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c
index 04da0699eab..e0c6d065d1a 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c
@@ -41,9 +41,14 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #ifdef WOLFSSL_SHA3
-static const word64 L_sha3_thumb2_rt[] = {
+XALIGNED(16) static const word64 L_sha3_thumb2_rt[] = {
     0x0000000000000001UL, 0x0000000000008082UL,
     0x800000000000808aUL, 0x8000000080008000UL,
     0x000000000000808bUL, 0x0000000080000001UL,
@@ -70,10 +75,8 @@ WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
     register word64* state __asm__ ("r0") = (word64*)state_p;
     register word64* L_sha3_thumb2_rt_c __asm__ ("r1") =
         (word64*)&L_sha3_thumb2_rt;
-
 #else
     register word64* L_sha3_thumb2_rt_c = (word64*)&L_sha3_thumb2_rt;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1153,8 +1156,13 @@ WC_OMIT_FRAME_POINTER void BlockSha3(word64* state)
         "BNE.W	L_sha3_thumb2_begin_%=\n\t"
 #endif
         "ADD	sp, sp, #0xcc\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [state] "+r" (state), [L_sha3_thumb2_rt] "+r" (L_sha3_thumb2_rt_c)
         :
+#else
+        :
+        : [state] "r" (state), [L_sha3_thumb2_rt] "r" (L_sha3_thumb2_rt_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S
index f36baae5c28..f05da4f8cad 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S
@@ -34,171 +34,60 @@
 	.syntax unified
 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
 #ifdef WOLFSSL_ARMASM_NO_NEON
+#ifndef __APPLE__
 	.text
 	.type	L_SHA512_transform_len_k, %object
 	.size	L_SHA512_transform_len_k, 640
-	.align	8
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	/* 16-byte aligned, 128-bit aligned */
+#ifndef __APPLE__
+	.align	4
+#else
+	.p2align	4
+#endif /* __APPLE__ */
 L_SHA512_transform_len_k:
-	.word	0xd728ae22
-	.word	0x428a2f98
-	.word	0x23ef65cd
-	.word	0x71374491
-	.word	0xec4d3b2f
-	.word	0xb5c0fbcf
-	.word	0x8189dbbc
-	.word	0xe9b5dba5
-	.word	0xf348b538
-	.word	0x3956c25b
-	.word	0xb605d019
-	.word	0x59f111f1
-	.word	0xaf194f9b
-	.word	0x923f82a4
-	.word	0xda6d8118
-	.word	0xab1c5ed5
-	.word	0xa3030242
-	.word	0xd807aa98
-	.word	0x45706fbe
-	.word	0x12835b01
-	.word	0x4ee4b28c
-	.word	0x243185be
-	.word	0xd5ffb4e2
-	.word	0x550c7dc3
-	.word	0xf27b896f
-	.word	0x72be5d74
-	.word	0x3b1696b1
-	.word	0x80deb1fe
-	.word	0x25c71235
-	.word	0x9bdc06a7
-	.word	0xcf692694
-	.word	0xc19bf174
-	.word	0x9ef14ad2
-	.word	0xe49b69c1
-	.word	0x384f25e3
-	.word	0xefbe4786
-	.word	0x8b8cd5b5
-	.word	0xfc19dc6
-	.word	0x77ac9c65
-	.word	0x240ca1cc
-	.word	0x592b0275
-	.word	0x2de92c6f
-	.word	0x6ea6e483
-	.word	0x4a7484aa
-	.word	0xbd41fbd4
-	.word	0x5cb0a9dc
-	.word	0x831153b5
-	.word	0x76f988da
-	.word	0xee66dfab
-	.word	0x983e5152
-	.word	0x2db43210
-	.word	0xa831c66d
-	.word	0x98fb213f
-	.word	0xb00327c8
-	.word	0xbeef0ee4
-	.word	0xbf597fc7
-	.word	0x3da88fc2
-	.word	0xc6e00bf3
-	.word	0x930aa725
-	.word	0xd5a79147
-	.word	0xe003826f
-	.word	0x6ca6351
-	.word	0xa0e6e70
-	.word	0x14292967
-	.word	0x46d22ffc
-	.word	0x27b70a85
-	.word	0x5c26c926
-	.word	0x2e1b2138
-	.word	0x5ac42aed
-	.word	0x4d2c6dfc
-	.word	0x9d95b3df
-	.word	0x53380d13
-	.word	0x8baf63de
-	.word	0x650a7354
-	.word	0x3c77b2a8
-	.word	0x766a0abb
-	.word	0x47edaee6
-	.word	0x81c2c92e
-	.word	0x1482353b
-	.word	0x92722c85
-	.word	0x4cf10364
-	.word	0xa2bfe8a1
-	.word	0xbc423001
-	.word	0xa81a664b
-	.word	0xd0f89791
-	.word	0xc24b8b70
-	.word	0x654be30
-	.word	0xc76c51a3
-	.word	0xd6ef5218
-	.word	0xd192e819
-	.word	0x5565a910
-	.word	0xd6990624
-	.word	0x5771202a
-	.word	0xf40e3585
-	.word	0x32bbd1b8
-	.word	0x106aa070
-	.word	0xb8d2d0c8
-	.word	0x19a4c116
-	.word	0x5141ab53
-	.word	0x1e376c08
-	.word	0xdf8eeb99
-	.word	0x2748774c
-	.word	0xe19b48a8
-	.word	0x34b0bcb5
-	.word	0xc5c95a63
-	.word	0x391c0cb3
-	.word	0xe3418acb
-	.word	0x4ed8aa4a
-	.word	0x7763e373
-	.word	0x5b9cca4f
-	.word	0xd6b2b8a3
-	.word	0x682e6ff3
-	.word	0x5defb2fc
-	.word	0x748f82ee
-	.word	0x43172f60
-	.word	0x78a5636f
-	.word	0xa1f0ab72
-	.word	0x84c87814
-	.word	0x1a6439ec
-	.word	0x8cc70208
-	.word	0x23631e28
-	.word	0x90befffa
-	.word	0xde82bde9
-	.word	0xa4506ceb
-	.word	0xb2c67915
-	.word	0xbef9a3f7
-	.word	0xe372532b
-	.word	0xc67178f2
-	.word	0xea26619c
-	.word	0xca273ece
-	.word	0x21c0c207
-	.word	0xd186b8c7
-	.word	0xcde0eb1e
-	.word	0xeada7dd6
-	.word	0xee6ed178
-	.word	0xf57d4f7f
-	.word	0x72176fba
-	.word	0x6f067aa
-	.word	0xa2c898a6
-	.word	0xa637dc5
-	.word	0xbef90dae
-	.word	0x113f9804
-	.word	0x131c471b
-	.word	0x1b710b35
-	.word	0x23047d84
-	.word	0x28db77f5
-	.word	0x40c72493
-	.word	0x32caab7b
-	.word	0x15c9bebc
-	.word	0x3c9ebe0a
-	.word	0x9c100d4c
-	.word	0x431d67c4
-	.word	0xcb3e42b6
-	.word	0x4cc5d4be
-	.word	0xfc657e2a
-	.word	0x597f299c
-	.word	0x3ad6faec
-	.word	0x5fcb6fab
-	.word	0x4a475817
-	.word	0x6c44198c
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 	.text
 	.align	4
 	.globl	Transform_Sha512_Len_base
@@ -3670,6 +3559,6 @@ L_SHA512_transform_len_start:
 #endif /* WOLFSSL_ARMASM */
 
 #if defined(__linux__) && defined(__ELF__)
-.section        .note.GNU-stack,"",%progbits
+.section	.note.GNU-stack,"",%progbits
 #endif
 #endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
index 531c7a02de5..4f87445f522 100644
--- a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
@@ -41,12 +41,17 @@
 #define __asm__        __asm
 #define __volatile__   volatile
 #endif /* __KEIL__ */
+#ifdef __ghs__
+#define __asm__        __asm
+#define __volatile__
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __ghs__ */
 
 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
 #include <wolfssl/wolfcrypt/sha512.h>
 
 #ifdef WOLFSSL_ARMASM_NO_NEON
-static const word64 L_SHA512_transform_len_k[] = {
+XALIGNED(16) static const word64 L_SHA512_transform_len_k[] = {
     0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
     0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
     0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
@@ -89,7 +94,8 @@ static const word64 L_SHA512_transform_len_k[] = {
     0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
 };
 
-void Transform_Sha512_Len_base(wc_Sha512* sha512, const byte* data, word32 len);
+void Transform_Sha512_Len_base(wc_Sha512* sha512_p, const byte* data_p,
+    word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
 WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512_p,
     const byte* data_p, word32 len_p)
@@ -104,11 +110,9 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512,
     register word32 len __asm__ ("r2") = (word32)len_p;
     register word64* L_SHA512_transform_len_k_c __asm__ ("r3") =
         (word64*)&L_SHA512_transform_len_k;
-
 #else
     register word64* L_SHA512_transform_len_k_c =
         (word64*)&L_SHA512_transform_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3582,9 +3586,15 @@ WC_OMIT_FRAME_POINTER void Transform_Sha512_Len_base(wc_Sha512* sha512,
 #endif
         "EOR	r0, r0, r0\n\t"
         "ADD	sp, sp, #0xc0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len),
           [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c)
         :
+#else
+        :
+        : [sha512] "r" (sha512), [data] "r" (data), [len] "r" (len),
+          [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k_c)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
diff --git a/wolfcrypt/src/port/ppc32/ppc32-sha256-asm.S b/wolfcrypt/src/port/ppc32/ppc32-sha256-asm.S
index 9ead9dd790f..66e9de8713e 100644
--- a/wolfcrypt/src/port/ppc32/ppc32-sha256-asm.S
+++ b/wolfcrypt/src/port/ppc32/ppc32-sha256-asm.S
@@ -31,76 +31,36 @@
 	.machine ppc
 #ifndef NO_SHA256
 #ifdef WOLFSSL_PPC32_ASM_SPE
-	.section	".text"
-	.section	.rodata
-	.type	L_SHA256_transform_spe_len_k, @object
+#ifndef __APPLE__
+	.text
+	.type	L_SHA256_transform_spe_len_k, %object
 	.size	L_SHA256_transform_spe_len_k, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_SHA256_transform_spe_len_k:
-	.long	0x428a2f98
-	.long	0x71374491
-	.long	0xb5c0fbcf
-	.long	0xe9b5dba5
-	.long	0x3956c25b
-	.long	0x59f111f1
-	.long	0x923f82a4
-	.long	0xab1c5ed5
-	.long	0xd807aa98
-	.long	0x12835b01
-	.long	0x243185be
-	.long	0x550c7dc3
-	.long	0x72be5d74
-	.long	0x80deb1fe
-	.long	0x9bdc06a7
-	.long	0xc19bf174
-	.long	0xe49b69c1
-	.long	0xefbe4786
-	.long	0xfc19dc6
-	.long	0x240ca1cc
-	.long	0x2de92c6f
-	.long	0x4a7484aa
-	.long	0x5cb0a9dc
-	.long	0x76f988da
-	.long	0x983e5152
-	.long	0xa831c66d
-	.long	0xb00327c8
-	.long	0xbf597fc7
-	.long	0xc6e00bf3
-	.long	0xd5a79147
-	.long	0x6ca6351
-	.long	0x14292967
-	.long	0x27b70a85
-	.long	0x2e1b2138
-	.long	0x4d2c6dfc
-	.long	0x53380d13
-	.long	0x650a7354
-	.long	0x766a0abb
-	.long	0x81c2c92e
-	.long	0x92722c85
-	.long	0xa2bfe8a1
-	.long	0xa81a664b
-	.long	0xc24b8b70
-	.long	0xc76c51a3
-	.long	0xd192e819
-	.long	0xd6990624
-	.long	0xf40e3585
-	.long	0x106aa070
-	.long	0x19a4c116
-	.long	0x1e376c08
-	.long	0x2748774c
-	.long	0x34b0bcb5
-	.long	0x391c0cb3
-	.long	0x4ed8aa4a
-	.long	0x5b9cca4f
-	.long	0x682e6ff3
-	.long	0x748f82ee
-	.long	0x78a5636f
-	.long	0x84c87814
-	.long	0x8cc70208
-	.long	0x90befffa
-	.long	0xa4506ceb
-	.long	0xbef9a3f7
-	.long	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.section	".text"
 	.align	4
 	.globl	Transform_Sha256_Len
@@ -1217,76 +1177,36 @@ L_SHA256_transform_spe_len_start:
 	.size	Transform_Sha256_Len,.-Transform_Sha256_Len
 #endif /* WOLFSSL_PPC32_ASM_SPE */
 #ifndef WOLFSSL_PPC32_ASM_SPE
-	.section	".text"
-	.section	.rodata
-	.type	L_SHA256_transform_len_k, @object
+#ifndef __APPLE__
+	.text
+	.type	L_SHA256_transform_len_k, %object
 	.size	L_SHA256_transform_len_k, 256
-	.align	4
+#else
+	.section	__DATA,__data
+#endif /* __APPLE__ */
+	# 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+	.align	3
+#else
+	.p2align	3
+#endif /* __APPLE__ */
 L_SHA256_transform_len_k:
-	.long	0x428a2f98
-	.long	0x71374491
-	.long	0xb5c0fbcf
-	.long	0xe9b5dba5
-	.long	0x3956c25b
-	.long	0x59f111f1
-	.long	0x923f82a4
-	.long	0xab1c5ed5
-	.long	0xd807aa98
-	.long	0x12835b01
-	.long	0x243185be
-	.long	0x550c7dc3
-	.long	0x72be5d74
-	.long	0x80deb1fe
-	.long	0x9bdc06a7
-	.long	0xc19bf174
-	.long	0xe49b69c1
-	.long	0xefbe4786
-	.long	0xfc19dc6
-	.long	0x240ca1cc
-	.long	0x2de92c6f
-	.long	0x4a7484aa
-	.long	0x5cb0a9dc
-	.long	0x76f988da
-	.long	0x983e5152
-	.long	0xa831c66d
-	.long	0xb00327c8
-	.long	0xbf597fc7
-	.long	0xc6e00bf3
-	.long	0xd5a79147
-	.long	0x6ca6351
-	.long	0x14292967
-	.long	0x27b70a85
-	.long	0x2e1b2138
-	.long	0x4d2c6dfc
-	.long	0x53380d13
-	.long	0x650a7354
-	.long	0x766a0abb
-	.long	0x81c2c92e
-	.long	0x92722c85
-	.long	0xa2bfe8a1
-	.long	0xa81a664b
-	.long	0xc24b8b70
-	.long	0xc76c51a3
-	.long	0xd192e819
-	.long	0xd6990624
-	.long	0xf40e3585
-	.long	0x106aa070
-	.long	0x19a4c116
-	.long	0x1e376c08
-	.long	0x2748774c
-	.long	0x34b0bcb5
-	.long	0x391c0cb3
-	.long	0x4ed8aa4a
-	.long	0x5b9cca4f
-	.long	0x682e6ff3
-	.long	0x748f82ee
-	.long	0x78a5636f
-	.long	0x84c87814
-	.long	0x8cc70208
-	.long	0x90befffa
-	.long	0xa4506ceb
-	.long	0xbef9a3f7
-	.long	0xc67178f2
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 #ifndef __PIC__
 	.section	".text"
 	.align	4
diff --git a/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_c.c b/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_c.c
index 5a005004e6b..9630846caab 100644
--- a/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_c.c
+++ b/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_c.c
@@ -29,8 +29,6 @@
 #include <wolfssl/wolfcrypt/error-crypt.h>
 
 #ifdef WOLFSSL_PPC32_ASM
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_PPC32_ASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -47,11 +45,12 @@
 #define __volatile__
 #define WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* __ghs__ */
+
 #ifndef NO_SHA256
 #include <wolfssl/wolfcrypt/sha256.h>
 
 #ifdef WOLFSSL_PPC32_ASM_SPE
-static const word32 L_SHA256_transform_spe_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_spe_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -73,21 +72,22 @@ static const word32 L_SHA256_transform_spe_len_k[] = {
 void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p,
     word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256_p,
+    const byte* data_p, word32 len_p)
 #else
-void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256,
+    const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("3") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("4") = (const byte*)data_p;
-    register word32 len asm ("5") = (word32)len_p;
-    register word32* L_SHA256_transform_spe_len_k_c asm ("6") =
+    register wc_Sha256* sha256 __asm__ ("3") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("4") = (const byte*)data_p;
+    register word32 len __asm__ ("5") = (word32)len_p;
+    register word32* L_SHA256_transform_spe_len_k_c __asm__ ("6") =
         (word32*)&L_SHA256_transform_spe_len_k;
 #else
     register word32* L_SHA256_transform_spe_len_k_c =
         (word32*)&L_SHA256_transform_spe_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -104,7 +104,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "lwz     21, 28(%[sha256])\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_spe_len_begin_%=: \n\t"
+    "L_SHA256_transform_spe_len_begin_%=:\n\t"
         /* Load W */
         "lwz     22, 0(%[data])\n\t"
         "lwz     0, 4(%[data])\n\t"
@@ -134,7 +134,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   0\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_spe_len_start_%=: \n\t"
+    "L_SHA256_transform_spe_len_start_%=:\n\t"
         /* Round 0 */
         "mr      9, 22\n\t"
         "rotlwi  6, 18, 26\n\t"
@@ -1178,7 +1178,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
 #ifndef WOLFSSL_PPC32_ASM_SPE
 #include <wolfssl/wolfcrypt/sha256.h>
 
-static const word32 L_SHA256_transform_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -1201,21 +1201,22 @@ static const word32 L_SHA256_transform_len_k[] = {
 void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p,
     word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256_p,
+    const byte* data_p, word32 len_p)
 #else
-void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256,
+    const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("3") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("4") = (const byte*)data_p;
-    register word32 len asm ("5") = (word32)len_p;
-    register word32* L_SHA256_transform_len_k_c asm ("6") =
+    register wc_Sha256* sha256 __asm__ ("3") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("4") = (const byte*)data_p;
+    register word32 len __asm__ ("5") = (word32)len_p;
+    register word32* L_SHA256_transform_len_k_c __asm__ ("6") =
         (word32*)&L_SHA256_transform_len_k;
 #else
     register word32* L_SHA256_transform_len_k_c =
         (word32*)&L_SHA256_transform_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1234,7 +1235,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   %[len]\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     16, 0(%[data])\n\t"
         "lwz     17, 4(%[data])\n\t"
@@ -3625,7 +3626,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "stw     %[len], 0(1)\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     16, 0(%[data])\n\t"
         "lwz     17, 4(%[data])\n\t"
@@ -3647,7 +3648,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   0\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_len_start_%=: \n\t"
+    "L_SHA256_transform_len_start_%=:\n\t"
         /* Round 0 */
         "rotlwi  0, 11, 26\n\t"
         "rotlwi  %[len], 11, 21\n\t"
@@ -3692,7 +3693,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     16, 16, 0\n\t"
         "add     16, 16, 25\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_0_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_0_%=:\n\t"
         /* Round 1 */
         "rotlwi  0, 10, 26\n\t"
         "rotlwi  %[len], 10, 21\n\t"
@@ -3737,7 +3738,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     17, 17, 0\n\t"
         "add     17, 17, 26\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_1_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_1_%=:\n\t"
         /* Round 2 */
         "rotlwi  0, 9, 26\n\t"
         "rotlwi  %[len], 9, 21\n\t"
@@ -3782,7 +3783,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     18, 18, 0\n\t"
         "add     18, 18, 27\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_2_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_2_%=:\n\t"
         /* Round 3 */
         "rotlwi  0, 8, 26\n\t"
         "rotlwi  %[len], 8, 21\n\t"
@@ -3827,7 +3828,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     19, 19, 0\n\t"
         "add     19, 19, 28\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_3_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_3_%=:\n\t"
         /* Round 4 */
         "rotlwi  0, 7, 26\n\t"
         "rotlwi  %[len], 7, 21\n\t"
@@ -3872,7 +3873,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     20, 20, 0\n\t"
         "add     20, 20, 29\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_4_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_4_%=:\n\t"
         /* Round 5 */
         "rotlwi  0, 15, 26\n\t"
         "rotlwi  %[len], 15, 21\n\t"
@@ -3917,7 +3918,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     21, 21, 0\n\t"
         "add     21, 21, 30\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_5_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_5_%=:\n\t"
         /* Round 6 */
         "rotlwi  0, 14, 26\n\t"
         "rotlwi  %[len], 14, 21\n\t"
@@ -3962,7 +3963,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     22, 22, 0\n\t"
         "add     22, 22, 31\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_6_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_6_%=:\n\t"
         /* Round 7 */
         "rotlwi  0, 12, 26\n\t"
         "rotlwi  %[len], 12, 21\n\t"
@@ -4007,7 +4008,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     23, 23, 0\n\t"
         "add     23, 23, 16\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_7_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_7_%=:\n\t"
         /* Round 8 */
         "rotlwi  0, 11, 26\n\t"
         "rotlwi  %[len], 11, 21\n\t"
@@ -4052,7 +4053,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     24, 24, 0\n\t"
         "add     24, 24, 17\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_8_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_8_%=:\n\t"
         /* Round 9 */
         "rotlwi  0, 10, 26\n\t"
         "rotlwi  %[len], 10, 21\n\t"
@@ -4097,7 +4098,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     25, 25, 0\n\t"
         "add     25, 25, 18\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_9_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_9_%=:\n\t"
         /* Round 10 */
         "rotlwi  0, 9, 26\n\t"
         "rotlwi  %[len], 9, 21\n\t"
@@ -4142,7 +4143,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     26, 26, 0\n\t"
         "add     26, 26, 19\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_10_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_10_%=:\n\t"
         /* Round 11 */
         "rotlwi  0, 8, 26\n\t"
         "rotlwi  %[len], 8, 21\n\t"
@@ -4187,7 +4188,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     27, 27, 0\n\t"
         "add     27, 27, 20\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_11_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_11_%=:\n\t"
         /* Round 12 */
         "rotlwi  0, 7, 26\n\t"
         "rotlwi  %[len], 7, 21\n\t"
@@ -4232,7 +4233,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     28, 28, 0\n\t"
         "add     28, 28, 21\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_12_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_12_%=:\n\t"
         /* Round 13 */
         "rotlwi  0, 15, 26\n\t"
         "rotlwi  %[len], 15, 21\n\t"
@@ -4277,7 +4278,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     29, 29, 0\n\t"
         "add     29, 29, 22\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_13_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_13_%=:\n\t"
         /* Round 14 */
         "rotlwi  0, 14, 26\n\t"
         "rotlwi  %[len], 14, 21\n\t"
@@ -4322,7 +4323,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     30, 30, 0\n\t"
         "add     30, 30, 23\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_14_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_14_%=:\n\t"
         /* Round 15 */
         "rotlwi  0, 12, 26\n\t"
         "rotlwi  %[len], 12, 21\n\t"
@@ -4367,7 +4368,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     31, 31, 0\n\t"
         "add     31, 31, 24\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_15_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_15_%=:\n\t"
         "addi    6, 6, 0x40\n\t"
         "bdnz    L_SHA256_transform_len_start_%=\n\t"
         "subi    6, 6, 0x100\n\t"
@@ -4424,21 +4425,22 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
 void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p,
     word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256_p,
+    const byte* data_p, word32 len_p)
 #else
-void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256,
+    const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("3") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("4") = (const byte*)data_p;
-    register word32 len asm ("5") = (word32)len_p;
-    register word32* L_SHA256_transform_len_k_c asm ("6") =
+    register wc_Sha256* sha256 __asm__ ("3") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("4") = (const byte*)data_p;
+    register word32 len __asm__ ("5") = (word32)len_p;
+    register word32* L_SHA256_transform_len_k_c __asm__ ("6") =
         (word32*)&L_SHA256_transform_len_k;
 #else
     register word32* L_SHA256_transform_len_k_c =
         (word32*)&L_SHA256_transform_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4461,7 +4463,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "lwz     %[sha256], 4(1)\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     14, 0(%[sha256])\n\t"
         "lwz     15, 4(%[sha256])\n\t"
@@ -6868,7 +6870,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "lwz     %[sha256], 4(1)\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     14, 0(%[sha256])\n\t"
         "lwz     15, 4(%[sha256])\n\t"
@@ -6890,7 +6892,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   %[sha256]\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_len_start_%=: \n\t"
+    "L_SHA256_transform_len_start_%=:\n\t"
         /* Round 0 */
         "rotlwi  %[sha256], 9, 26\n\t"
         "rotlwi  %[len], 9, 21\n\t"
@@ -6935,7 +6937,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     14, 14, %[sha256]\n\t"
         "add     14, 14, 23\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_0_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_0_%=:\n\t"
         /* Round 1 */
         "rotlwi  %[sha256], 8, 26\n\t"
         "rotlwi  %[len], 8, 21\n\t"
@@ -6980,7 +6982,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     15, 15, %[sha256]\n\t"
         "add     15, 15, 24\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_1_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_1_%=:\n\t"
         /* Round 2 */
         "rotlwi  %[sha256], 7, 26\n\t"
         "rotlwi  %[len], 7, 21\n\t"
@@ -7025,7 +7027,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     16, 16, %[sha256]\n\t"
         "add     16, 16, 25\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_2_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_2_%=:\n\t"
         /* Round 3 */
         "rotlwi  %[sha256], %[data], 26\n\t"
         "rotlwi  %[len], %[data], 21\n\t"
@@ -7070,7 +7072,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     17, 17, %[sha256]\n\t"
         "add     17, 17, 26\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_3_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_3_%=:\n\t"
         /* Round 4 */
         "rotlwi  %[sha256], 0, 26\n\t"
         "rotlwi  %[len], 0, 21\n\t"
@@ -7115,7 +7117,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     18, 18, %[sha256]\n\t"
         "add     18, 18, 27\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_4_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_4_%=:\n\t"
         /* Round 5 */
         "rotlwi  %[sha256], 12, 26\n\t"
         "rotlwi  %[len], 12, 21\n\t"
@@ -7160,7 +7162,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     19, 19, %[sha256]\n\t"
         "add     19, 19, 28\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_5_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_5_%=:\n\t"
         /* Round 6 */
         "rotlwi  %[sha256], 11, 26\n\t"
         "rotlwi  %[len], 11, 21\n\t"
@@ -7205,7 +7207,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     20, 20, %[sha256]\n\t"
         "add     20, 20, 29\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_6_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_6_%=:\n\t"
         /* Round 7 */
         "rotlwi  %[sha256], 10, 26\n\t"
         "rotlwi  %[len], 10, 21\n\t"
@@ -7250,7 +7252,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     21, 21, %[sha256]\n\t"
         "add     21, 21, 14\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_7_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_7_%=:\n\t"
         /* Round 8 */
         "rotlwi  %[sha256], 9, 26\n\t"
         "rotlwi  %[len], 9, 21\n\t"
@@ -7295,7 +7297,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     22, 22, %[sha256]\n\t"
         "add     22, 22, 15\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_8_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_8_%=:\n\t"
         /* Round 9 */
         "rotlwi  %[sha256], 8, 26\n\t"
         "rotlwi  %[len], 8, 21\n\t"
@@ -7340,7 +7342,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     23, 23, %[sha256]\n\t"
         "add     23, 23, 16\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_9_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_9_%=:\n\t"
         /* Round 10 */
         "rotlwi  %[sha256], 7, 26\n\t"
         "rotlwi  %[len], 7, 21\n\t"
@@ -7385,7 +7387,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     24, 24, %[sha256]\n\t"
         "add     24, 24, 17\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_10_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_10_%=:\n\t"
         /* Round 11 */
         "rotlwi  %[sha256], %[data], 26\n\t"
         "rotlwi  %[len], %[data], 21\n\t"
@@ -7430,7 +7432,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     25, 25, %[sha256]\n\t"
         "add     25, 25, 18\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_11_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_11_%=:\n\t"
         /* Round 12 */
         "rotlwi  %[sha256], 0, 26\n\t"
         "rotlwi  %[len], 0, 21\n\t"
@@ -7475,7 +7477,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     26, 26, %[sha256]\n\t"
         "add     26, 26, 19\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_12_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_12_%=:\n\t"
         /* Round 13 */
         "rotlwi  %[sha256], 12, 26\n\t"
         "rotlwi  %[len], 12, 21\n\t"
@@ -7520,7 +7522,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     27, 27, %[sha256]\n\t"
         "add     27, 27, 20\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_13_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_13_%=:\n\t"
         /* Round 14 */
         "rotlwi  %[sha256], 11, 26\n\t"
         "rotlwi  %[len], 11, 21\n\t"
@@ -7565,7 +7567,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     28, 28, %[sha256]\n\t"
         "add     28, 28, 21\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_14_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_14_%=:\n\t"
         /* Round 15 */
         "rotlwi  %[sha256], 10, 26\n\t"
         "rotlwi  %[len], 10, 21\n\t"
@@ -7610,7 +7612,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     29, 29, %[sha256]\n\t"
         "add     29, 29, 22\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_15_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_15_%=:\n\t"
         "addi    6, 6, 0x40\n\t"
         "bdnz    L_SHA256_transform_len_start_%=\n\t"
         "subi    6, 6, 0x100\n\t"
diff --git a/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_cr.c b/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_cr.c
index 2f8c2b8b129..1a1d42cf338 100644
--- a/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_cr.c
+++ b/wolfcrypt/src/port/ppc32/ppc32-sha256-asm_cr.c
@@ -29,8 +29,6 @@
 #include <wolfssl/wolfcrypt/error-crypt.h>
 
 #ifdef WOLFSSL_PPC32_ASM
-#include <stdint.h>
-#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 #ifdef WOLFSSL_PPC32_ASM_INLINE
 
 #ifdef __IAR_SYSTEMS_ICC__
@@ -47,11 +45,12 @@
 #define __volatile__
 #define WOLFSSL_NO_VAR_ASSIGN_REG
 #endif /* __ghs__ */
+
 #ifndef NO_SHA256
 #include <wolfssl/wolfcrypt/sha256.h>
 
 #ifdef WOLFSSL_PPC32_ASM_SPE
-static const word32 L_SHA256_transform_spe_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_spe_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -73,21 +72,22 @@ static const word32 L_SHA256_transform_spe_len_k[] = {
 void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p,
     word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256_p,
+    const byte* data_p, word32 len_p)
 #else
-void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256,
+    const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("3") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("4") = (const byte*)data_p;
-    register word32 len asm ("5") = (word32)len_p;
-    register word32* L_SHA256_transform_spe_len_k_c asm ("6") =
+    register wc_Sha256* sha256 __asm__ ("r3") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("r4") = (const byte*)data_p;
+    register word32 len __asm__ ("r5") = (word32)len_p;
+    register word32* L_SHA256_transform_spe_len_k_c __asm__ ("r6") =
         (word32*)&L_SHA256_transform_spe_len_k;
 #else
     register word32* L_SHA256_transform_spe_len_k_c =
         (word32*)&L_SHA256_transform_spe_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -104,7 +104,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "lwz     r21, 28(%[sha256])\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_spe_len_begin_%=: \n\t"
+    "L_SHA256_transform_spe_len_begin_%=:\n\t"
         /* Load W */
         "lwz     r22, 0(%[data])\n\t"
         "lwz     r0, 4(%[data])\n\t"
@@ -134,7 +134,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   r0\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_spe_len_start_%=: \n\t"
+    "L_SHA256_transform_spe_len_start_%=:\n\t"
         /* Round 0 */
         "mr      r9, r22\n\t"
         "rotlwi  r6, r18, 26\n\t"
@@ -1178,7 +1178,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
 #ifndef WOLFSSL_PPC32_ASM_SPE
 #include <wolfssl/wolfcrypt/sha256.h>
 
-static const word32 L_SHA256_transform_len_k[] = {
+XALIGNED(8) static const word32 L_SHA256_transform_len_k[] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -1201,21 +1201,22 @@ static const word32 L_SHA256_transform_len_k[] = {
 void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p,
     word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256_p,
+    const byte* data_p, word32 len_p)
 #else
-void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256,
+    const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("3") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("4") = (const byte*)data_p;
-    register word32 len asm ("5") = (word32)len_p;
-    register word32* L_SHA256_transform_len_k_c asm ("6") =
+    register wc_Sha256* sha256 __asm__ ("r3") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("r4") = (const byte*)data_p;
+    register word32 len __asm__ ("r5") = (word32)len_p;
+    register word32* L_SHA256_transform_len_k_c __asm__ ("r6") =
         (word32*)&L_SHA256_transform_len_k;
 #else
     register word32* L_SHA256_transform_len_k_c =
         (word32*)&L_SHA256_transform_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -1234,7 +1235,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   %[len]\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     r16, 0(%[data])\n\t"
         "lwz     r17, 4(%[data])\n\t"
@@ -3625,7 +3626,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "stw     %[len], 0(r1)\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     r16, 0(%[data])\n\t"
         "lwz     r17, 4(%[data])\n\t"
@@ -3647,7 +3648,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   r0\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_len_start_%=: \n\t"
+    "L_SHA256_transform_len_start_%=:\n\t"
         /* Round 0 */
         "rotlwi  r0, r11, 26\n\t"
         "rotlwi  %[len], r11, 21\n\t"
@@ -3692,7 +3693,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r16, r16, r0\n\t"
         "add     r16, r16, r25\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_0_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_0_%=:\n\t"
         /* Round 1 */
         "rotlwi  r0, r10, 26\n\t"
         "rotlwi  %[len], r10, 21\n\t"
@@ -3737,7 +3738,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r17, r17, r0\n\t"
         "add     r17, r17, r26\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_1_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_1_%=:\n\t"
         /* Round 2 */
         "rotlwi  r0, r9, 26\n\t"
         "rotlwi  %[len], r9, 21\n\t"
@@ -3782,7 +3783,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r18, r18, r0\n\t"
         "add     r18, r18, r27\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_2_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_2_%=:\n\t"
         /* Round 3 */
         "rotlwi  r0, r8, 26\n\t"
         "rotlwi  %[len], r8, 21\n\t"
@@ -3827,7 +3828,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r19, r19, r0\n\t"
         "add     r19, r19, r28\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_3_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_3_%=:\n\t"
         /* Round 4 */
         "rotlwi  r0, r7, 26\n\t"
         "rotlwi  %[len], r7, 21\n\t"
@@ -3872,7 +3873,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r20, r20, r0\n\t"
         "add     r20, r20, r29\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_4_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_4_%=:\n\t"
         /* Round 5 */
         "rotlwi  r0, r15, 26\n\t"
         "rotlwi  %[len], r15, 21\n\t"
@@ -3917,7 +3918,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r21, r21, r0\n\t"
         "add     r21, r21, r30\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_5_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_5_%=:\n\t"
         /* Round 6 */
         "rotlwi  r0, r14, 26\n\t"
         "rotlwi  %[len], r14, 21\n\t"
@@ -3962,7 +3963,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r22, r22, r0\n\t"
         "add     r22, r22, r31\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_6_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_6_%=:\n\t"
         /* Round 7 */
         "rotlwi  r0, r12, 26\n\t"
         "rotlwi  %[len], r12, 21\n\t"
@@ -4007,7 +4008,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r23, r23, r0\n\t"
         "add     r23, r23, r16\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_7_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_7_%=:\n\t"
         /* Round 8 */
         "rotlwi  r0, r11, 26\n\t"
         "rotlwi  %[len], r11, 21\n\t"
@@ -4052,7 +4053,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r24, r24, r0\n\t"
         "add     r24, r24, r17\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_8_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_8_%=:\n\t"
         /* Round 9 */
         "rotlwi  r0, r10, 26\n\t"
         "rotlwi  %[len], r10, 21\n\t"
@@ -4097,7 +4098,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r25, r25, r0\n\t"
         "add     r25, r25, r18\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_9_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_9_%=:\n\t"
         /* Round 10 */
         "rotlwi  r0, r9, 26\n\t"
         "rotlwi  %[len], r9, 21\n\t"
@@ -4142,7 +4143,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r26, r26, r0\n\t"
         "add     r26, r26, r19\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_10_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_10_%=:\n\t"
         /* Round 11 */
         "rotlwi  r0, r8, 26\n\t"
         "rotlwi  %[len], r8, 21\n\t"
@@ -4187,7 +4188,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r27, r27, r0\n\t"
         "add     r27, r27, r20\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_11_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_11_%=:\n\t"
         /* Round 12 */
         "rotlwi  r0, r7, 26\n\t"
         "rotlwi  %[len], r7, 21\n\t"
@@ -4232,7 +4233,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r28, r28, r0\n\t"
         "add     r28, r28, r21\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_12_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_12_%=:\n\t"
         /* Round 13 */
         "rotlwi  r0, r15, 26\n\t"
         "rotlwi  %[len], r15, 21\n\t"
@@ -4277,7 +4278,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r29, r29, r0\n\t"
         "add     r29, r29, r22\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_13_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_13_%=:\n\t"
         /* Round 14 */
         "rotlwi  r0, r14, 26\n\t"
         "rotlwi  %[len], r14, 21\n\t"
@@ -4322,7 +4323,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r30, r30, r0\n\t"
         "add     r30, r30, r23\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_14_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_14_%=:\n\t"
         /* Round 15 */
         "rotlwi  r0, r12, 26\n\t"
         "rotlwi  %[len], r12, 21\n\t"
@@ -4367,7 +4368,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r31, r31, r0\n\t"
         "add     r31, r31, r24\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_15_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_15_%=:\n\t"
         "addi    r6, r6, 0x40\n\t"
         "bdnz    L_SHA256_transform_len_start_%=\n\t"
         "subi    r6, r6, 0x100\n\t"
@@ -4424,21 +4425,22 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
 void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p,
     word32 len_p);
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256_p,
+    const byte* data_p, word32 len_p)
 #else
-void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+WC_OMIT_FRAME_POINTER void Transform_Sha256_Len(wc_Sha256* sha256,
+    const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register wc_Sha256* sha256 asm ("3") = (wc_Sha256*)sha256_p;
-    register const byte* data asm ("4") = (const byte*)data_p;
-    register word32 len asm ("5") = (word32)len_p;
-    register word32* L_SHA256_transform_len_k_c asm ("6") =
+    register wc_Sha256* sha256 __asm__ ("r3") = (wc_Sha256*)sha256_p;
+    register const byte* data __asm__ ("r4") = (const byte*)data_p;
+    register word32 len __asm__ ("r5") = (word32)len_p;
+    register word32* L_SHA256_transform_len_k_c __asm__ ("r6") =
         (word32*)&L_SHA256_transform_len_k;
 #else
     register word32* L_SHA256_transform_len_k_c =
         (word32*)&L_SHA256_transform_len_k;
-
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4461,7 +4463,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "lwz     %[sha256], 4(r1)\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     r14, 0(%[sha256])\n\t"
         "lwz     r15, 4(%[sha256])\n\t"
@@ -6868,7 +6870,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "lwz     %[sha256], 4(r1)\n\t"
         /* Start of loop processing a block */
         "\n"
-    "L_SHA256_transform_len_begin_%=: \n\t"
+    "L_SHA256_transform_len_begin_%=:\n\t"
         /* Load W - 64 bytes */
         "lwz     r14, 0(%[sha256])\n\t"
         "lwz     r15, 4(%[sha256])\n\t"
@@ -6890,7 +6892,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "mtctr   %[sha256]\n\t"
         /* Start of 16 rounds */
         "\n"
-    "L_SHA256_transform_len_start_%=: \n\t"
+    "L_SHA256_transform_len_start_%=:\n\t"
         /* Round 0 */
         "rotlwi  %[sha256], r9, 26\n\t"
         "rotlwi  %[len], r9, 21\n\t"
@@ -6935,7 +6937,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r14, r14, %[sha256]\n\t"
         "add     r14, r14, r23\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_0_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_0_%=:\n\t"
         /* Round 1 */
         "rotlwi  %[sha256], r8, 26\n\t"
         "rotlwi  %[len], r8, 21\n\t"
@@ -6980,7 +6982,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r15, r15, %[sha256]\n\t"
         "add     r15, r15, r24\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_1_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_1_%=:\n\t"
         /* Round 2 */
         "rotlwi  %[sha256], r7, 26\n\t"
         "rotlwi  %[len], r7, 21\n\t"
@@ -7025,7 +7027,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r16, r16, %[sha256]\n\t"
         "add     r16, r16, r25\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_2_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_2_%=:\n\t"
         /* Round 3 */
         "rotlwi  %[sha256], %[data], 26\n\t"
         "rotlwi  %[len], %[data], 21\n\t"
@@ -7070,7 +7072,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r17, r17, %[sha256]\n\t"
         "add     r17, r17, r26\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_3_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_3_%=:\n\t"
         /* Round 4 */
         "rotlwi  %[sha256], r0, 26\n\t"
         "rotlwi  %[len], r0, 21\n\t"
@@ -7115,7 +7117,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r18, r18, %[sha256]\n\t"
         "add     r18, r18, r27\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_4_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_4_%=:\n\t"
         /* Round 5 */
         "rotlwi  %[sha256], r12, 26\n\t"
         "rotlwi  %[len], r12, 21\n\t"
@@ -7160,7 +7162,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r19, r19, %[sha256]\n\t"
         "add     r19, r19, r28\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_5_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_5_%=:\n\t"
         /* Round 6 */
         "rotlwi  %[sha256], r11, 26\n\t"
         "rotlwi  %[len], r11, 21\n\t"
@@ -7205,7 +7207,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r20, r20, %[sha256]\n\t"
         "add     r20, r20, r29\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_6_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_6_%=:\n\t"
         /* Round 7 */
         "rotlwi  %[sha256], r10, 26\n\t"
         "rotlwi  %[len], r10, 21\n\t"
@@ -7250,7 +7252,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r21, r21, %[sha256]\n\t"
         "add     r21, r21, r14\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_7_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_7_%=:\n\t"
         /* Round 8 */
         "rotlwi  %[sha256], r9, 26\n\t"
         "rotlwi  %[len], r9, 21\n\t"
@@ -7295,7 +7297,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r22, r22, %[sha256]\n\t"
         "add     r22, r22, r15\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_8_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_8_%=:\n\t"
         /* Round 9 */
         "rotlwi  %[sha256], r8, 26\n\t"
         "rotlwi  %[len], r8, 21\n\t"
@@ -7340,7 +7342,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r23, r23, %[sha256]\n\t"
         "add     r23, r23, r16\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_9_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_9_%=:\n\t"
         /* Round 10 */
         "rotlwi  %[sha256], r7, 26\n\t"
         "rotlwi  %[len], r7, 21\n\t"
@@ -7385,7 +7387,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r24, r24, %[sha256]\n\t"
         "add     r24, r24, r17\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_10_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_10_%=:\n\t"
         /* Round 11 */
         "rotlwi  %[sha256], %[data], 26\n\t"
         "rotlwi  %[len], %[data], 21\n\t"
@@ -7430,7 +7432,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r25, r25, %[sha256]\n\t"
         "add     r25, r25, r18\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_11_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_11_%=:\n\t"
         /* Round 12 */
         "rotlwi  %[sha256], r0, 26\n\t"
         "rotlwi  %[len], r0, 21\n\t"
@@ -7475,7 +7477,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r26, r26, %[sha256]\n\t"
         "add     r26, r26, r19\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_12_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_12_%=:\n\t"
         /* Round 13 */
         "rotlwi  %[sha256], r12, 26\n\t"
         "rotlwi  %[len], r12, 21\n\t"
@@ -7520,7 +7522,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r27, r27, %[sha256]\n\t"
         "add     r27, r27, r20\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_13_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_13_%=:\n\t"
         /* Round 14 */
         "rotlwi  %[sha256], r11, 26\n\t"
         "rotlwi  %[len], r11, 21\n\t"
@@ -7565,7 +7567,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r28, r28, %[sha256]\n\t"
         "add     r28, r28, r21\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_14_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_14_%=:\n\t"
         /* Round 15 */
         "rotlwi  %[sha256], r10, 26\n\t"
         "rotlwi  %[len], r10, 21\n\t"
@@ -7610,7 +7612,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len)
         "add     r29, r29, %[sha256]\n\t"
         "add     r29, r29, r22\n\t"
         "\n"
-    "L_SHA256_transform_len_after_blk_15_%=: \n\t"
+    "L_SHA256_transform_len_after_blk_15_%=:\n\t"
         "addi    r6, r6, 0x40\n\t"
         "bdnz    L_SHA256_transform_len_start_%=\n\t"
         "subi    r6, r6, 0x100\n\t"
diff --git a/wolfcrypt/src/sha256_asm.S b/wolfcrypt/src/sha256_asm.S
index 1f103ad2c7a..a407b7de1f5 100644
--- a/wolfcrypt/src/sha256_asm.S
+++ b/wolfcrypt/src/sha256_asm.S
@@ -53,6 +53,11 @@
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sse2_sha256_sha_k:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -81,7 +86,7 @@ L_sse2_sha256_sha_k:
 .p2align	4
 #endif /* __APPLE__ */
 L_sse2_sha256_shuf_mask:
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_SSE2_Sha
@@ -487,6 +492,11 @@ L_sha256_sha_len_sse2_start:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_avx1_sha256_k:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -515,7 +525,7 @@ L_avx1_sha256_k:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_sha256_shuf_00BA:
-.quad	0xb0a090803020100, 0xffffffffffffffff
+.quad	0x0b0a090803020100,0xffffffffffffffff
 #ifndef __APPLE__
 .data
 #else
@@ -527,7 +537,7 @@ L_avx1_sha256_shuf_00BA:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_sha256_shuf_DC00:
-.quad	0xffffffffffffffff, 0xb0a090803020100
+.quad	0xffffffffffffffff,0x0b0a090803020100
 #ifndef __APPLE__
 .data
 #else
@@ -539,7 +549,7 @@ L_avx1_sha256_shuf_DC00:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_sha256_flip_mask:
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_AVX1
@@ -5342,6 +5352,11 @@ L_sha256_len_avx1_start:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_avx1_rorx_sha256_k:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -5370,7 +5385,7 @@ L_avx1_rorx_sha256_k:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_rorx_sha256_shuf_00BA:
-.quad	0xb0a090803020100, 0xffffffffffffffff
+.quad	0x0b0a090803020100,0xffffffffffffffff
 #ifndef __APPLE__
 .data
 #else
@@ -5382,7 +5397,7 @@ L_avx1_rorx_sha256_shuf_00BA:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_rorx_sha256_shuf_DC00:
-.quad	0xffffffffffffffff, 0xb0a090803020100
+.quad	0xffffffffffffffff,0x0b0a090803020100
 #ifndef __APPLE__
 .data
 #else
@@ -5394,7 +5409,7 @@ L_avx1_rorx_sha256_shuf_DC00:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_rorx_sha256_flip_mask:
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_AVX1_RORX
@@ -10114,6 +10129,11 @@ L_sha256_len_avx1_len_rorx_start:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_avx1_sha256_sha_k:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -10142,7 +10162,7 @@ L_avx1_sha256_sha_k:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_sha256_shuf_mask:
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_AVX1_Sha
@@ -10493,6 +10513,11 @@ L_sha256_sha_len_avx1_start:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_avx2_sha256_k:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -10537,8 +10562,8 @@ L_avx2_sha256_k:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_sha256_shuf_00BA:
-.quad	0xb0a090803020100, 0xffffffffffffffff
-.quad	0xb0a090803020100, 0xffffffffffffffff
+.quad	0x0b0a090803020100,0xffffffffffffffff
+.quad	0x0b0a090803020100,0xffffffffffffffff
 #ifndef __APPLE__
 .data
 #else
@@ -10550,8 +10575,8 @@ L_avx2_sha256_shuf_00BA:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_sha256_shuf_DC00:
-.quad	0xffffffffffffffff, 0xb0a090803020100
-.quad	0xffffffffffffffff, 0xb0a090803020100
+.quad	0xffffffffffffffff,0x0b0a090803020100
+.quad	0xffffffffffffffff,0x0b0a090803020100
 #ifndef __APPLE__
 .data
 #else
@@ -10563,8 +10588,8 @@ L_avx2_sha256_shuf_DC00:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_sha256_flip_mask:
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_AVX2
@@ -17092,6 +17117,11 @@ L_sha256_len_avx2_done:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_avx2_rorx_sha256_k:
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -17136,8 +17166,8 @@ L_avx2_rorx_sha256_k:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_rorx_sha256_flip_mask:
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
-.quad	0x405060700010203, 0xc0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
+.quad	0x0405060700010203,0x0c0d0e0f08090a0b
 #ifndef __APPLE__
 .data
 #else
@@ -17149,8 +17179,8 @@ L_avx2_rorx_sha256_flip_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_rorx_sha256_shuf_00BA:
-.quad	0xb0a090803020100, 0xffffffffffffffff
-.quad	0xb0a090803020100, 0xffffffffffffffff
+.quad	0x0b0a090803020100,0xffffffffffffffff
+.quad	0x0b0a090803020100,0xffffffffffffffff
 #ifndef __APPLE__
 .data
 #else
@@ -17162,8 +17192,8 @@ L_avx2_rorx_sha256_shuf_00BA:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_rorx_sha256_shuf_DC00:
-.quad	0xffffffffffffffff, 0xb0a090803020100
-.quad	0xffffffffffffffff, 0xb0a090803020100
+.quad	0xffffffffffffffff,0x0b0a090803020100
+.quad	0xffffffffffffffff,0x0b0a090803020100
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha256_AVX2_RORX
diff --git a/wolfcrypt/src/sha3_asm.S b/wolfcrypt/src/sha3_asm.S
index 8151a7d61bc..387a42a4c86 100644
--- a/wolfcrypt/src/sha3_asm.S
+++ b/wolfcrypt/src/sha3_asm.S
@@ -53,9 +53,9 @@
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_sha3_avx2_r:
 .quad	0x0000000000000001,0x0000000000000001
@@ -112,9 +112,9 @@ L_sha3_avx2_r:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_sha3_x4_avx2_r:
 .quad	0x0000000000000001,0x0000000000000001
@@ -9314,9 +9314,9 @@ L_sha3_block_n_bmi2_rounds:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_sha3_block_avx2_rotl:
 .quad	0x0000000000000001,0x000000000000003e
@@ -9337,9 +9337,9 @@ L_sha3_block_avx2_rotl:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_sha3_block_avx2_rotr:
 .quad	0x000000000000003f,0x0000000000000002
@@ -9533,9 +9533,9 @@ L_sha3_block_avx2_start:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_sha3_block_n_avx2_rotl:
 .quad	0x0000000000000001,0x000000000000003e
@@ -9556,9 +9556,9 @@ L_sha3_block_n_avx2_rotl:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_sha3_block_n_avx2_rotr:
 .quad	0x000000000000003f,0x0000000000000002
@@ -15298,8 +15298,8 @@ _sha3_blocksx4_avx2:
 .p2align	5
 #endif /* __APPLE__ */
 L_sha3_128_blockx4_seed_avx2_end_mark:
-.quad	0x8000000000000000, 0x8000000000000000
-.quad	0x8000000000000000, 0x8000000000000000
+.quad	0x8000000000000000,0x8000000000000000
+.quad	0x8000000000000000,0x8000000000000000
 #ifndef __APPLE__
 .text
 .globl	sha3_128_blocksx4_seed_avx2
@@ -20677,8 +20677,8 @@ _sha3_128_blocksx4_seed_avx2:
 .p2align	5
 #endif /* __APPLE__ */
 L_sha3_256_blockx4_seed_avx2_end_mark:
-.quad	0x8000000000000000, 0x8000000000000000
-.quad	0x8000000000000000, 0x8000000000000000
+.quad	0x8000000000000000,0x8000000000000000
+.quad	0x8000000000000000,0x8000000000000000
 #ifndef __APPLE__
 .text
 .globl	sha3_256_blocksx4_seed_avx2
@@ -26057,8 +26057,8 @@ _sha3_256_blocksx4_seed_avx2:
 .p2align	5
 #endif /* __APPLE__ */
 L_sha3_256_blockx4_seed_64_avx2_end_mark:
-.quad	0x8000000000000000, 0x8000000000000000
-.quad	0x8000000000000000, 0x8000000000000000
+.quad	0x8000000000000000,0x8000000000000000
+.quad	0x8000000000000000,0x8000000000000000
 #ifndef __APPLE__
 .text
 .globl	sha3_256_blocksx4_seed_64_avx2
diff --git a/wolfcrypt/src/sha512_asm.S b/wolfcrypt/src/sha512_asm.S
index 00042f95c33..d0ca1dd4fd4 100644
--- a/wolfcrypt/src/sha512_asm.S
+++ b/wolfcrypt/src/sha512_asm.S
@@ -54,9 +54,9 @@
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_avx1_sha512_k:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
@@ -110,7 +110,7 @@ L_avx1_sha512_k:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_sha512_flip_mask:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha512_AVX1
@@ -2682,9 +2682,9 @@ L_sha512_len_avx1_start:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_avx1_rorx_sha512_k:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
@@ -2738,7 +2738,7 @@ L_avx1_rorx_sha512_k:
 .p2align	4
 #endif /* __APPLE__ */
 L_avx1_rorx_sha512_flip_mask:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha512_AVX1_RORX
@@ -5186,9 +5186,9 @@ L_sha512_len_avx1_rorx_start:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_avx2_sha512_k:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
@@ -5237,9 +5237,9 @@ L_avx2_sha512_k:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_avx2_sha512_k_2:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
@@ -5345,8 +5345,8 @@ L_avx2_sha512_k_2_end:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_sha512_flip_mask:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha512_AVX2
@@ -6430,11 +6430,11 @@ _Transform_Sha512_AVX2_Len:
         movq	%rsi, %rbp
         testb	$0x80, %bpl
         je	L_sha512_len_avx2_block
-        movq	224(%rdi), %rcx
-        vmovdqu	(%rcx), %ymm0
-        vmovdqu	32(%rcx), %ymm1
-        vmovdqu	64(%rcx), %ymm2
-        vmovdqu	96(%rcx), %ymm3
+        movq	224(%rdi), %rbx
+        vmovdqu	(%rbx), %ymm0
+        vmovdqu	32(%rbx), %ymm1
+        vmovdqu	64(%rbx), %ymm2
+        vmovdqu	96(%rbx), %ymm3
         vmovups	%ymm0, 64(%rdi)
         vmovups	%ymm1, 96(%rdi)
         vmovups	%ymm2, 128(%rdi)
@@ -8043,9 +8043,9 @@ L_sha512_len_avx2_done:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_avx2_rorx_sha512_k:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
@@ -8094,9 +8094,9 @@ L_avx2_rorx_sha512_k:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_avx2_rorx_sha512_k_2:
 .quad	0x428a2f98d728ae22,0x7137449123ef65cd
@@ -8202,8 +8202,8 @@ L_avx2_rorx_sha512_k_2_end:
 .p2align	5
 #endif /* __APPLE__ */
 L_avx2_rorx_sha512_flip_mask:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
 #ifndef __APPLE__
 .text
 .globl	Transform_Sha512_AVX2_RORX
diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index dd05a2578f2..41c7d9ce1eb 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -308,12 +308,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2317,12 +2317,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2684,12 +2684,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2829,12 +2829,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2877,11 +2877,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_16(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -2938,12 +2938,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_16(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_16(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3065,11 +3065,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3154,12 +3154,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3313,11 +3313,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3458,12 +3458,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -3673,11 +3673,11 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -4896,11 +4896,11 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5152,11 +5152,11 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5283,12 +5283,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5367,12 +5367,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_16(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_16(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5465,12 +5465,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5593,19 +5593,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0x100\n\t"
         "\n"
-    "L_sp_2048_add_64_word_%=: \n\t"
+    "L_sp_2048_add_64_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -5645,18 +5645,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x100\n\t"
         "\n"
-    "L_sp_2048_sub_in_place_64_word_%=: \n\t"
+    "L_sp_2048_sub_in_place_64_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -5696,12 +5696,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5714,13 +5714,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_2048_mul_64_outer_%=: \n\t"
+    "L_sp_2048_mul_64_outer_%=:\n\t"
         "subs	r3, r5, #0xfc\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_2048_mul_64_inner_%=: \n\t"
+    "L_sp_2048_mul_64_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -5838,7 +5838,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_2048_mul_64_inner_done_%=: \n\t"
+    "L_sp_2048_mul_64_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -5880,7 +5880,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_2048_mul_64_store_%=: \n\t"
+    "L_sp_2048_mul_64_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -5907,11 +5907,11 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_64(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -5923,13 +5923,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_2048_sqr_64_outer_%=: \n\t"
+    "L_sp_2048_sqr_64_outer_%=:\n\t"
         "subs	r3, r5, #0xfc\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_2048_sqr_64_inner_%=: \n\t"
+    "L_sp_2048_sqr_64_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -6014,7 +6014,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_2048_sqr_64_inner_done_%=: \n\t"
+    "L_sp_2048_sqr_64_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -6050,7 +6050,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_2048_sqr_64_store_%=: \n\t"
+    "L_sp_2048_sqr_64_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -6099,19 +6099,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0x80\n\t"
         "\n"
-    "L_sp_2048_add_32_word_%=: \n\t"
+    "L_sp_2048_add_32_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -6151,18 +6151,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x80\n\t"
         "\n"
-    "L_sp_2048_sub_in_place_32_word_%=: \n\t"
+    "L_sp_2048_sub_in_place_32_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -6202,12 +6202,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -6220,13 +6220,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_2048_mul_32_outer_%=: \n\t"
+    "L_sp_2048_mul_32_outer_%=:\n\t"
         "subs	r3, r5, #0x7c\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_2048_mul_32_inner_%=: \n\t"
+    "L_sp_2048_mul_32_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -6344,7 +6344,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_2048_mul_32_inner_done_%=: \n\t"
+    "L_sp_2048_mul_32_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -6386,7 +6386,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_2048_mul_32_store_%=: \n\t"
+    "L_sp_2048_mul_32_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -6413,11 +6413,11 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_32(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -6429,13 +6429,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_2048_sqr_32_outer_%=: \n\t"
+    "L_sp_2048_sqr_32_outer_%=:\n\t"
         "subs	r3, r5, #0x7c\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_2048_sqr_32_inner_%=: \n\t"
+    "L_sp_2048_sqr_32_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -6520,7 +6520,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_2048_sqr_32_inner_done_%=: \n\t"
+    "L_sp_2048_sqr_32_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -6556,7 +6556,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_2048_sqr_32_store_%=: \n\t"
+    "L_sp_2048_sqr_32_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -6609,12 +6609,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -6650,7 +6650,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_2048_mul_d_64_word_%=: \n\t"
+    "L_sp_2048_mul_d_64_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -6720,12 +6720,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -8816,13 +8816,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -8830,7 +8830,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_2048_cond_sub_32_words_%=: \n\t"
+    "L_sp_2048_cond_sub_32_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -8869,13 +8869,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -9019,12 +9019,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -9037,7 +9037,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_2048_mont_reduce_32_word_%=: \n\t"
+    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -10013,12 +10013,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -10029,7 +10029,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_2048_mont_reduce_32_word_%=: \n\t"
+    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -10322,12 +10322,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -10340,7 +10340,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_2048_mont_reduce_32_word_%=: \n\t"
+    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -10572,12 +10572,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -10613,7 +10613,7 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_2048_mul_d_32_word_%=: \n\t"
+    "L_sp_2048_mul_d_32_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -10683,12 +10683,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -11741,12 +11741,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_32(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -11813,12 +11813,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_32(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -11836,7 +11836,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_2048_word_32_bit_%=: \n\t"
+    "L_div_2048_word_32_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -11962,11 +11962,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_32(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_32(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -11977,7 +11977,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_32(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #0x7c\n\t"
         "\n"
-    "L_sp_2048_cmp_32_words_%=: \n\t"
+    "L_sp_2048_cmp_32_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -12742,13 +12742,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -12756,7 +12756,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_2048_cond_sub_64_words_%=: \n\t"
+    "L_sp_2048_cond_sub_64_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -12795,13 +12795,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -13057,12 +13057,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -13075,7 +13075,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_2048_mont_reduce_64_word_%=: \n\t"
+    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -14979,12 +14979,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -14995,7 +14995,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_2048_mont_reduce_64_word_%=: \n\t"
+    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -15544,12 +15544,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -15562,7 +15562,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_2048_mont_reduce_64_word_%=: \n\t"
+    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -15954,19 +15954,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x100\n\t"
         "\n"
-    "L_sp_2048_sub_64_word_%=: \n\t"
+    "L_sp_2048_sub_64_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
         "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
@@ -16005,12 +16005,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -16156,12 +16156,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_64(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -16228,12 +16228,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_64(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -16251,7 +16251,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_2048_word_64_bit_%=: \n\t"
+    "L_div_2048_word_64_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -16481,11 +16481,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_64(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_64(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -16496,7 +16496,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_64(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #0xfc\n\t"
         "\n"
-    "L_sp_2048_cmp_64_words_%=: \n\t"
+    "L_sp_2048_cmp_64_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -17722,13 +17722,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -17736,7 +17736,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r,
         "mov	r6, #0\n\t"
         "mov	r12, #0\n\t"
         "\n"
-    "L_sp_2048_cond_add_32_words_%=: \n\t"
+    "L_sp_2048_cond_add_32_words_%=:\n\t"
         "adds	lr, lr, #-1\n\t"
         "ldr	r4, [%[a], r12]\n\t"
         "ldr	r5, [%[b], r12]\n\t"
@@ -17775,13 +17775,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -18191,12 +18191,12 @@ WC_OMIT_FRAME_POINTER static void sp_2048_lshift_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_2048_lshift_64(sp_digit* r,
     const sp_digit* a, byte n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register byte n asm ("r2") = (byte)n_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register byte n __asm__ ("r2") = (byte)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -19012,12 +19012,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -24522,12 +24522,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -24577,11 +24577,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_24(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -24652,12 +24652,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_24(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_24(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -24797,11 +24797,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -24914,12 +24914,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -25101,11 +25101,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -25302,12 +25302,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_96(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -25572,11 +25572,11 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_12(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -28649,12 +28649,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -28740,12 +28740,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_24(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_24(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -28852,12 +28852,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -29008,19 +29008,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_96(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0x180\n\t"
         "\n"
-    "L_sp_3072_add_96_word_%=: \n\t"
+    "L_sp_3072_add_96_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -29060,18 +29060,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x180\n\t"
         "\n"
-    "L_sp_3072_sub_in_place_96_word_%=: \n\t"
+    "L_sp_3072_sub_in_place_96_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -29111,12 +29111,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -29129,13 +29129,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_3072_mul_96_outer_%=: \n\t"
+    "L_sp_3072_mul_96_outer_%=:\n\t"
         "subs	r3, r5, #0x17c\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_3072_mul_96_inner_%=: \n\t"
+    "L_sp_3072_mul_96_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -29253,7 +29253,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_3072_mul_96_inner_done_%=: \n\t"
+    "L_sp_3072_mul_96_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -29295,7 +29295,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_3072_mul_96_store_%=: \n\t"
+    "L_sp_3072_mul_96_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -29322,11 +29322,11 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_96(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -29338,13 +29338,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_3072_sqr_96_outer_%=: \n\t"
+    "L_sp_3072_sqr_96_outer_%=:\n\t"
         "subs	r3, r5, #0x17c\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_3072_sqr_96_inner_%=: \n\t"
+    "L_sp_3072_sqr_96_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -29429,7 +29429,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_3072_sqr_96_inner_done_%=: \n\t"
+    "L_sp_3072_sqr_96_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -29465,7 +29465,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_3072_sqr_96_store_%=: \n\t"
+    "L_sp_3072_sqr_96_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -29514,19 +29514,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0xc0\n\t"
         "\n"
-    "L_sp_3072_add_48_word_%=: \n\t"
+    "L_sp_3072_add_48_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -29566,18 +29566,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0xc0\n\t"
         "\n"
-    "L_sp_3072_sub_in_place_48_word_%=: \n\t"
+    "L_sp_3072_sub_in_place_48_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -29617,12 +29617,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -29635,13 +29635,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_3072_mul_48_outer_%=: \n\t"
+    "L_sp_3072_mul_48_outer_%=:\n\t"
         "subs	r3, r5, #0xbc\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_3072_mul_48_inner_%=: \n\t"
+    "L_sp_3072_mul_48_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -29759,7 +29759,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_3072_mul_48_inner_done_%=: \n\t"
+    "L_sp_3072_mul_48_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -29801,7 +29801,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_3072_mul_48_store_%=: \n\t"
+    "L_sp_3072_mul_48_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -29828,11 +29828,11 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_48(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -29844,13 +29844,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_3072_sqr_48_outer_%=: \n\t"
+    "L_sp_3072_sqr_48_outer_%=:\n\t"
         "subs	r3, r5, #0xbc\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_3072_sqr_48_inner_%=: \n\t"
+    "L_sp_3072_sqr_48_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -29935,7 +29935,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_3072_sqr_48_inner_done_%=: \n\t"
+    "L_sp_3072_sqr_48_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -29971,7 +29971,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_3072_sqr_48_store_%=: \n\t"
+    "L_sp_3072_sqr_48_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -30024,12 +30024,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -30065,7 +30065,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_3072_mul_d_96_word_%=: \n\t"
+    "L_sp_3072_mul_d_96_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -30135,12 +30135,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -33255,13 +33255,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -33269,7 +33269,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_3072_cond_sub_48_words_%=: \n\t"
+    "L_sp_3072_cond_sub_48_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -33308,13 +33308,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -33514,12 +33514,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -33532,7 +33532,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_3072_mont_reduce_48_word_%=: \n\t"
+    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -34972,12 +34972,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -34988,7 +34988,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_3072_mont_reduce_48_word_%=: \n\t"
+    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -35409,12 +35409,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -35427,7 +35427,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_3072_mont_reduce_48_word_%=: \n\t"
+    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -35739,12 +35739,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -35780,7 +35780,7 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_3072_mul_d_48_word_%=: \n\t"
+    "L_sp_3072_mul_d_48_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -35850,12 +35850,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -37420,12 +37420,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_48(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -37492,12 +37492,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_48(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -37515,7 +37515,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_3072_word_48_bit_%=: \n\t"
+    "L_div_3072_word_48_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -37641,11 +37641,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_48(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_48(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -37656,7 +37656,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_48(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #0xbc\n\t"
         "\n"
-    "L_sp_3072_cmp_48_words_%=: \n\t"
+    "L_sp_3072_cmp_48_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -38597,13 +38597,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -38611,7 +38611,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_3072_cond_sub_96_words_%=: \n\t"
+    "L_sp_3072_cond_sub_96_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -38650,13 +38650,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -39024,12 +39024,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -39042,7 +39042,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_3072_mont_reduce_96_word_%=: \n\t"
+    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -41874,12 +41874,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -41890,7 +41890,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_3072_mont_reduce_96_word_%=: \n\t"
+    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -42695,12 +42695,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -42713,7 +42713,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_3072_mont_reduce_96_word_%=: \n\t"
+    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -43265,19 +43265,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x180\n\t"
         "\n"
-    "L_sp_3072_sub_96_word_%=: \n\t"
+    "L_sp_3072_sub_96_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
         "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
@@ -43316,12 +43316,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -43523,12 +43523,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_96(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -43595,12 +43595,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_96(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -43618,7 +43618,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_3072_word_96_bit_%=: \n\t"
+    "L_div_3072_word_96_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -43848,11 +43848,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_96(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_96(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -43868,7 +43868,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_96(const sp_digit* a,
         "mov	r4, #0x17c\n\t"
 #endif
         "\n"
-    "L_sp_3072_cmp_96_words_%=: \n\t"
+    "L_sp_3072_cmp_96_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -45446,13 +45446,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -45460,7 +45460,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r,
         "mov	r6, #0\n\t"
         "mov	r12, #0\n\t"
         "\n"
-    "L_sp_3072_cond_add_48_words_%=: \n\t"
+    "L_sp_3072_cond_add_48_words_%=:\n\t"
         "adds	lr, lr, #-1\n\t"
         "ldr	r4, [%[a], r12]\n\t"
         "ldr	r5, [%[b], r12]\n\t"
@@ -45499,13 +45499,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -45971,12 +45971,12 @@ WC_OMIT_FRAME_POINTER static void sp_3072_lshift_96(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_3072_lshift_96(sp_digit* r,
     const sp_digit* a, byte n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register byte n asm ("r2") = (byte)n_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register byte n __asm__ ("r2") = (byte)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -46983,11 +46983,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -47240,12 +47240,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_add_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_add_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -47576,19 +47576,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_add_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_add_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0x200\n\t"
         "\n"
-    "L_sp_4096_add_128_word_%=: \n\t"
+    "L_sp_4096_add_128_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -47628,18 +47628,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x200\n\t"
         "\n"
-    "L_sp_4096_sub_in_place_128_word_%=: \n\t"
+    "L_sp_4096_sub_in_place_128_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -47679,12 +47679,12 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_4096_mul_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -47697,13 +47697,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_128(sp_digit* r,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_4096_mul_128_outer_%=: \n\t"
+    "L_sp_4096_mul_128_outer_%=:\n\t"
         "subs	r3, r5, #0x1fc\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_4096_mul_128_inner_%=: \n\t"
+    "L_sp_4096_mul_128_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -47821,7 +47821,7 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_128(sp_digit* r,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_4096_mul_128_inner_done_%=: \n\t"
+    "L_sp_4096_mul_128_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -47863,7 +47863,7 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_128(sp_digit* r,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_4096_mul_128_store_%=: \n\t"
+    "L_sp_4096_mul_128_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -47891,11 +47891,11 @@ WC_OMIT_FRAME_POINTER static void sp_4096_sqr_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_4096_sqr_128(sp_digit* r,
     const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -47907,13 +47907,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_sqr_128(sp_digit* r,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_4096_sqr_128_outer_%=: \n\t"
+    "L_sp_4096_sqr_128_outer_%=:\n\t"
         "subs	r3, r5, #0x1fc\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_4096_sqr_128_inner_%=: \n\t"
+    "L_sp_4096_sqr_128_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -47998,7 +47998,7 @@ WC_OMIT_FRAME_POINTER static void sp_4096_sqr_128(sp_digit* r,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_4096_sqr_128_inner_done_%=: \n\t"
+    "L_sp_4096_sqr_128_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -48034,7 +48034,7 @@ WC_OMIT_FRAME_POINTER static void sp_4096_sqr_128(sp_digit* r,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_4096_sqr_128_store_%=: \n\t"
+    "L_sp_4096_sqr_128_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -48085,12 +48085,12 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -48126,7 +48126,7 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_4096_mul_d_128_word_%=: \n\t"
+    "L_sp_4096_mul_d_128_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -48196,12 +48196,12 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -52341,13 +52341,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -52355,7 +52355,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_4096_cond_sub_128_words_%=: \n\t"
+    "L_sp_4096_cond_sub_128_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -52394,13 +52394,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -52880,12 +52880,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -52898,7 +52898,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_4096_mont_reduce_128_word_%=: \n\t"
+    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -56658,12 +56658,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -56674,7 +56674,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_4096_mont_reduce_128_word_%=: \n\t"
+    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -57735,12 +57735,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -57753,7 +57753,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_4096_mont_reduce_128_word_%=: \n\t"
+    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -58465,19 +58465,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x200\n\t"
         "\n"
-    "L_sp_4096_sub_128_word_%=: \n\t"
+    "L_sp_4096_sub_128_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
         "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
@@ -58516,12 +58516,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -58779,12 +58779,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_4096_word_128(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_4096_word_128(sp_digit d1,
     sp_digit d0, sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -58851,12 +58851,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_4096_word_128(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_4096_word_128(sp_digit d1,
     sp_digit d0, sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -58874,7 +58874,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_4096_word_128(sp_digit d1,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_4096_word_128_bit_%=: \n\t"
+    "L_div_4096_word_128_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -59104,11 +59104,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_4096_cmp_128(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_4096_cmp_128(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -59124,7 +59124,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_4096_cmp_128(const sp_digit* a,
         "mov	r4, #0x1fc\n\t"
 #endif
         "\n"
-    "L_sp_4096_cmp_128_words_%=: \n\t"
+    "L_sp_4096_cmp_128_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -61054,13 +61054,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -61068,7 +61068,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r,
         "mov	r6, #0\n\t"
         "mov	r12, #0\n\t"
         "\n"
-    "L_sp_4096_cond_add_64_words_%=: \n\t"
+    "L_sp_4096_cond_add_64_words_%=:\n\t"
         "adds	lr, lr, #-1\n\t"
         "ldr	r4, [%[a], r12]\n\t"
         "ldr	r5, [%[b], r12]\n\t"
@@ -61107,13 +61107,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -61635,12 +61635,12 @@ WC_OMIT_FRAME_POINTER static void sp_4096_lshift_128(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_4096_lshift_128(sp_digit* r,
     const sp_digit* a, byte n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register byte n asm ("r2") = (byte)n_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register byte n __asm__ ("r2") = (byte)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -62707,12 +62707,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -62725,13 +62725,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_256_mul_8_outer_%=: \n\t"
+    "L_sp_256_mul_8_outer_%=:\n\t"
         "subs	r3, r5, #28\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_256_mul_8_inner_%=: \n\t"
+    "L_sp_256_mul_8_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -62849,7 +62849,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_256_mul_8_inner_done_%=: \n\t"
+    "L_sp_256_mul_8_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -62891,7 +62891,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_256_mul_8_store_%=: \n\t"
+    "L_sp_256_mul_8_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -62922,12 +62922,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -64931,12 +64931,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -65298,12 +65298,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -65443,11 +65443,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -65459,13 +65459,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_256_sqr_8_outer_%=: \n\t"
+    "L_sp_256_sqr_8_outer_%=:\n\t"
         "subs	r3, r5, #28\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_256_sqr_8_inner_%=: \n\t"
+    "L_sp_256_sqr_8_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -65550,7 +65550,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_256_sqr_8_inner_done_%=: \n\t"
+    "L_sp_256_sqr_8_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -65586,7 +65586,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_256_sqr_8_store_%=: \n\t"
+    "L_sp_256_sqr_8_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -65615,11 +65615,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -66838,11 +66838,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -67094,11 +67094,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -67227,19 +67227,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_add_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_add_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #32\n\t"
         "\n"
-    "L_sp_256_add_8_word_%=: \n\t"
+    "L_sp_256_add_8_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -67279,12 +67279,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_add_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_add_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -67329,11 +67329,11 @@ WC_OMIT_FRAME_POINTER static int sp_256_mod_mul_norm_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static int sp_256_mod_mul_norm_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -67783,12 +67783,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -69926,12 +69926,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -70426,12 +70426,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -70702,11 +70702,11 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -71921,11 +71921,11 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -72309,11 +72309,11 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -72667,11 +72667,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_256_cmp_8(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_256_cmp_8(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -72682,7 +72682,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_256_cmp_8(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #28\n\t"
         "\n"
-    "L_sp_256_cmp_8_words_%=: \n\t"
+    "L_sp_256_cmp_8_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -72822,13 +72822,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -72836,7 +72836,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_256_cond_sub_8_words_%=: \n\t"
+    "L_sp_256_cond_sub_8_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -72875,13 +72875,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -72944,12 +72944,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -72962,7 +72962,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_256_mont_reduce_8_word_%=: \n\t"
+    "L_sp_256_mont_reduce_8_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -73242,12 +73242,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -73258,7 +73258,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_256_mont_reduce_8_word_%=: \n\t"
+    "L_sp_256_mont_reduce_8_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -73359,12 +73359,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -73377,7 +73377,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_256_mont_reduce_8_word_%=: \n\t"
+    "L_sp_256_mont_reduce_8_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -73459,10 +73459,10 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -73632,12 +73632,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -73650,7 +73650,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_256_mont_reduce_order_8_word_%=: \n\t"
+    "L_sp_256_mont_reduce_order_8_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -73930,12 +73930,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -73946,7 +73946,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_256_mont_reduce_order_8_word_%=: \n\t"
+    "L_sp_256_mont_reduce_order_8_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -74047,12 +74047,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -74065,7 +74065,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_256_mont_reduce_order_8_word_%=: \n\t"
+    "L_sp_256_mont_reduce_order_8_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -74188,12 +74188,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mont_add_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mont_add_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -74261,11 +74261,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_mont_dbl_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mont_dbl_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -74329,11 +74329,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_mont_tpl_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mont_tpl_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -74430,12 +74430,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mont_sub_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mont_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -74501,12 +74501,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mont_div2_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mont_div2_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -77886,10 +77886,10 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am,
 WC_OMIT_FRAME_POINTER static void sp_256_add_one_8(sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_add_one_8(sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78271,18 +78271,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_in_place_8(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_in_place_8(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #32\n\t"
         "\n"
-    "L_sp_256_sub_in_place_8_word_%=: \n\t"
+    "L_sp_256_sub_in_place_8_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -78320,11 +78320,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_in_place_8(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_in_place_8(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78369,12 +78369,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
     sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78410,7 +78410,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_256_mul_d_8_word_%=: \n\t"
+    "L_sp_256_mul_d_8_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -78480,12 +78480,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
     sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78770,12 +78770,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_256_word_8(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78842,12 +78842,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_256_word_8(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -78865,7 +78865,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_256_word_8_bit_%=: \n\t"
+    "L_div_256_word_8_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -79603,19 +79603,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #32\n\t"
         "\n"
-    "L_sp_256_sub_8_word_%=: \n\t"
+    "L_sp_256_sub_8_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
         "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
@@ -79654,12 +79654,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -79697,11 +79697,11 @@ WC_OMIT_FRAME_POINTER static void sp_256_rshift1_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_rshift1_8(sp_digit* r,
     const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -79793,12 +79793,12 @@ WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -79822,7 +79822,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r,
         "adc	r3, r12, r12\n\t"
         "b	L_sp_256_div2_mod_8_div2_%=\n\t"
         "\n"
-    "L_sp_256_div2_mod_8_even_%=: \n\t"
+    "L_sp_256_div2_mod_8_even_%=:\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "ldr	r4, [%[a], #12]\n\t"
         "ldr	r5, [%[a], #16]\n\t"
@@ -79836,7 +79836,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r,
         "ldrd	r6, r7, [%[a], #20]\n\t"
 #endif
         "\n"
-    "L_sp_256_div2_mod_8_div2_%=: \n\t"
+    "L_sp_256_div2_mod_8_div2_%=:\n\t"
         "lsr	r8, r4, #1\n\t"
         "and	r4, r4, #1\n\t"
         "lsr	r9, r5, #1\n\t"
@@ -79882,7 +79882,7 @@ WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r,
 }
 
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
-static const byte L_sp_256_num_bits_8_table[] = {
+XALIGNED(4) static const word8 L_sp_256_num_bits_8_table[] = {
     0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
     0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
@@ -79921,15 +79921,15 @@ static const byte L_sp_256_num_bits_8_table[] = {
 WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register byte* L_sp_256_num_bits_8_table_c asm ("r1") =
-        (byte*)&L_sp_256_num_bits_8_table;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register word8* L_sp_256_num_bits_8_table_c __asm__ ("r1") =
+        (word8*)&L_sp_256_num_bits_8_table;
 #else
-    register byte* L_sp_256_num_bits_8_table_c =
-        (byte*)&L_sp_256_num_bits_8_table;
+    register word8* L_sp_256_num_bits_8_table_c =
+        (word8*)&L_sp_256_num_bits_8_table;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -79945,7 +79945,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_7_3_%=: \n\t"
+    "L_sp_256_num_bits_8_7_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -79955,7 +79955,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_7_2_%=: \n\t"
+    "L_sp_256_num_bits_8_7_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -79965,14 +79965,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_7_1_%=: \n\t"
+    "L_sp_256_num_bits_8_7_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xe0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_7_%=: \n\t"
+    "L_sp_256_num_bits_8_7_%=:\n\t"
         "ldr	r1, [%[a], #24]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_6_%=\n\t"
@@ -79984,7 +79984,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_6_3_%=: \n\t"
+    "L_sp_256_num_bits_8_6_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -79994,7 +79994,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_6_2_%=: \n\t"
+    "L_sp_256_num_bits_8_6_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80004,14 +80004,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_6_1_%=: \n\t"
+    "L_sp_256_num_bits_8_6_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xc0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_6_%=: \n\t"
+    "L_sp_256_num_bits_8_6_%=:\n\t"
         "ldr	r1, [%[a], #20]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_5_%=\n\t"
@@ -80023,7 +80023,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_5_3_%=: \n\t"
+    "L_sp_256_num_bits_8_5_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80033,7 +80033,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_5_2_%=: \n\t"
+    "L_sp_256_num_bits_8_5_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80043,14 +80043,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_5_1_%=: \n\t"
+    "L_sp_256_num_bits_8_5_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xa0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_5_%=: \n\t"
+    "L_sp_256_num_bits_8_5_%=:\n\t"
         "ldr	r1, [%[a], #16]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_4_%=\n\t"
@@ -80062,7 +80062,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_4_3_%=: \n\t"
+    "L_sp_256_num_bits_8_4_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80072,7 +80072,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_4_2_%=: \n\t"
+    "L_sp_256_num_bits_8_4_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80082,14 +80082,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_4_1_%=: \n\t"
+    "L_sp_256_num_bits_8_4_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x80\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_4_%=: \n\t"
+    "L_sp_256_num_bits_8_4_%=:\n\t"
         "ldr	r1, [%[a], #12]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_3_%=\n\t"
@@ -80101,7 +80101,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_3_3_%=: \n\t"
+    "L_sp_256_num_bits_8_3_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80111,7 +80111,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_3_2_%=: \n\t"
+    "L_sp_256_num_bits_8_3_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80121,14 +80121,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_3_1_%=: \n\t"
+    "L_sp_256_num_bits_8_3_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x60\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_3_%=: \n\t"
+    "L_sp_256_num_bits_8_3_%=:\n\t"
         "ldr	r1, [%[a], #8]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_2_%=\n\t"
@@ -80140,7 +80140,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_2_3_%=: \n\t"
+    "L_sp_256_num_bits_8_2_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80150,7 +80150,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_2_2_%=: \n\t"
+    "L_sp_256_num_bits_8_2_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80160,14 +80160,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_2_1_%=: \n\t"
+    "L_sp_256_num_bits_8_2_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x40\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_2_%=: \n\t"
+    "L_sp_256_num_bits_8_2_%=:\n\t"
         "ldr	r1, [%[a], #4]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_1_%=\n\t"
@@ -80179,7 +80179,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_1_3_%=: \n\t"
+    "L_sp_256_num_bits_8_1_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80189,7 +80189,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_1_2_%=: \n\t"
+    "L_sp_256_num_bits_8_1_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80199,14 +80199,14 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_1_1_%=: \n\t"
+    "L_sp_256_num_bits_8_1_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #32\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_1_%=: \n\t"
+    "L_sp_256_num_bits_8_1_%=:\n\t"
         "ldr	r1, [%[a]]\n\t"
         "lsr	r3, r1, #24\n\t"
         "cmp	r3, #0\n\t"
@@ -80216,7 +80216,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_0_3_%=: \n\t"
+    "L_sp_256_num_bits_8_0_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80226,7 +80226,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_0_2_%=: \n\t"
+    "L_sp_256_num_bits_8_0_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -80236,11 +80236,11 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_0_1_%=: \n\t"
+    "L_sp_256_num_bits_8_0_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "\n"
-    "L_sp_256_num_bits_8_9_%=: \n\t"
+    "L_sp_256_num_bits_8_9_%=:\n\t"
         "mov	%[a], r12\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a),
@@ -80261,10 +80261,10 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
 WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -80276,7 +80276,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_7_%=: \n\t"
+    "L_sp_256_num_bits_8_7_%=:\n\t"
         "ldr	r1, [%[a], #24]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_6_%=\n\t"
@@ -80285,7 +80285,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_6_%=: \n\t"
+    "L_sp_256_num_bits_8_6_%=:\n\t"
         "ldr	r1, [%[a], #20]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_5_%=\n\t"
@@ -80294,7 +80294,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_5_%=: \n\t"
+    "L_sp_256_num_bits_8_5_%=:\n\t"
         "ldr	r1, [%[a], #16]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_4_%=\n\t"
@@ -80303,7 +80303,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_4_%=: \n\t"
+    "L_sp_256_num_bits_8_4_%=:\n\t"
         "ldr	r1, [%[a], #12]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_3_%=\n\t"
@@ -80312,7 +80312,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_3_%=: \n\t"
+    "L_sp_256_num_bits_8_3_%=:\n\t"
         "ldr	r1, [%[a], #8]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_2_%=\n\t"
@@ -80321,7 +80321,7 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_2_%=: \n\t"
+    "L_sp_256_num_bits_8_2_%=:\n\t"
         "ldr	r1, [%[a], #4]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_256_num_bits_8_1_%=\n\t"
@@ -80330,13 +80330,13 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_256_num_bits_8_9_%=\n\t"
         "\n"
-    "L_sp_256_num_bits_8_1_%=: \n\t"
+    "L_sp_256_num_bits_8_1_%=:\n\t"
         "ldr	r1, [%[a]]\n\t"
         "mov	r2, #32\n\t"
         "clz	r12, r1\n\t"
         "sub	r12, r2, r12\n\t"
         "\n"
-    "L_sp_256_num_bits_8_9_%=: \n\t"
+    "L_sp_256_num_bits_8_9_%=:\n\t"
         "mov	%[a], r12\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
@@ -81292,12 +81292,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -81310,13 +81310,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_384_mul_12_outer_%=: \n\t"
+    "L_sp_384_mul_12_outer_%=:\n\t"
         "subs	r3, r5, #44\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_384_mul_12_inner_%=: \n\t"
+    "L_sp_384_mul_12_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -81434,7 +81434,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_384_mul_12_inner_done_%=: \n\t"
+    "L_sp_384_mul_12_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -81476,7 +81476,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_384_mul_12_store_%=: \n\t"
+    "L_sp_384_mul_12_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -81506,12 +81506,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -87016,11 +87016,11 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -87032,13 +87032,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_384_sqr_12_outer_%=: \n\t"
+    "L_sp_384_sqr_12_outer_%=:\n\t"
         "subs	r3, r5, #44\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_384_sqr_12_inner_%=: \n\t"
+    "L_sp_384_sqr_12_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -87123,7 +87123,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_384_sqr_12_inner_done_%=: \n\t"
+    "L_sp_384_sqr_12_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -87159,7 +87159,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_384_sqr_12_store_%=: \n\t"
+    "L_sp_384_sqr_12_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -87187,11 +87187,11 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -90266,19 +90266,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_add_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #48\n\t"
         "\n"
-    "L_sp_384_add_12_word_%=: \n\t"
+    "L_sp_384_add_12_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -90318,12 +90318,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_add_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -90662,13 +90662,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -90676,7 +90676,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_384_cond_sub_12_words_%=: \n\t"
+    "L_sp_384_cond_sub_12_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -90715,13 +90715,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -90797,12 +90797,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -90815,7 +90815,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_384_mont_reduce_12_word_%=: \n\t"
+    "L_sp_384_mont_reduce_12_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -91211,12 +91211,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -91227,7 +91227,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_384_mont_reduce_12_word_%=: \n\t"
+    "L_sp_384_mont_reduce_12_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -91360,12 +91360,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -91378,7 +91378,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_384_mont_reduce_12_word_%=: \n\t"
+    "L_sp_384_mont_reduce_12_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -91622,11 +91622,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_384_cmp_12(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_384_cmp_12(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -91637,7 +91637,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_384_cmp_12(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #44\n\t"
         "\n"
-    "L_sp_384_cmp_12_words_%=: \n\t"
+    "L_sp_384_cmp_12_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -91859,13 +91859,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_add_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mont_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register const sp_digit* m asm ("r3") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register const sp_digit* m __asm__ ("r3") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -91959,12 +91959,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_dbl_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mont_dbl_12(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -92048,12 +92048,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_tpl_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mont_tpl_12(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -92203,12 +92203,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -92262,13 +92262,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -92276,7 +92276,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r,
         "mov	r6, #0\n\t"
         "mov	r12, #0\n\t"
         "\n"
-    "L_sp_384_cond_add_12_words_%=: \n\t"
+    "L_sp_384_cond_add_12_words_%=:\n\t"
         "adds	lr, lr, #-1\n\t"
         "ldr	r4, [%[a], r12]\n\t"
         "ldr	r5, [%[b], r12]\n\t"
@@ -92315,13 +92315,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -92395,13 +92395,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_sub_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mont_sub_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register const sp_digit* m asm ("r3") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register const sp_digit* m __asm__ ("r3") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -92490,11 +92490,11 @@ WC_OMIT_FRAME_POINTER static void sp_384_rshift1_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_rshift1_12(sp_digit* r,
     const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -95935,10 +95935,10 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am,
 WC_OMIT_FRAME_POINTER static void sp_384_add_one_12(sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_add_one_12(sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -96326,18 +96326,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_in_place_12(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_in_place_12(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #48\n\t"
         "\n"
-    "L_sp_384_sub_in_place_12_word_%=: \n\t"
+    "L_sp_384_sub_in_place_12_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -96375,11 +96375,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_in_place_12(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_in_place_12(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -96431,12 +96431,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -96472,7 +96472,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_384_mul_d_12_word_%=: \n\t"
+    "L_sp_384_mul_d_12_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -96542,12 +96542,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -96960,12 +96960,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_384_word_12(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_384_word_12(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -97032,12 +97032,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_384_word_12(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_384_word_12(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -97055,7 +97055,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_384_word_12(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_384_word_12_bit_%=: \n\t"
+    "L_div_384_word_12_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -97763,12 +97763,12 @@ WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -97800,7 +97800,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r,
         "adc	r3, r12, r12\n\t"
         "b	L_sp_384_div2_mod_12_div2_%=\n\t"
         "\n"
-    "L_sp_384_div2_mod_12_even_%=: \n\t"
+    "L_sp_384_div2_mod_12_even_%=:\n\t"
         "ldm	%[a]!, {r5, r6, r7}\n\t"
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
@@ -97808,7 +97808,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r,
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         "\n"
-    "L_sp_384_div2_mod_12_div2_%=: \n\t"
+    "L_sp_384_div2_mod_12_div2_%=:\n\t"
         "sub	%[r], %[r], #48\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "ldm	r0, {r8, r9}\n\t"
@@ -97874,7 +97874,7 @@ WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r,
 }
 
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
-static const byte L_sp_384_num_bits_12_table[] = {
+XALIGNED(4) static const word8 L_sp_384_num_bits_12_table[] = {
     0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
     0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
@@ -97913,15 +97913,15 @@ static const byte L_sp_384_num_bits_12_table[] = {
 WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register byte* L_sp_384_num_bits_12_table_c asm ("r1") =
-        (byte*)&L_sp_384_num_bits_12_table;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register word8* L_sp_384_num_bits_12_table_c __asm__ ("r1") =
+        (word8*)&L_sp_384_num_bits_12_table;
 #else
-    register byte* L_sp_384_num_bits_12_table_c =
-        (byte*)&L_sp_384_num_bits_12_table;
+    register word8* L_sp_384_num_bits_12_table_c =
+        (word8*)&L_sp_384_num_bits_12_table;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -97942,7 +97942,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_11_3_%=: \n\t"
+    "L_sp_384_num_bits_12_11_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -97957,7 +97957,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_11_2_%=: \n\t"
+    "L_sp_384_num_bits_12_11_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -97972,7 +97972,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_11_1_%=: \n\t"
+    "L_sp_384_num_bits_12_11_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x60\n\t"
@@ -97984,7 +97984,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_11_%=: \n\t"
+    "L_sp_384_num_bits_12_11_%=:\n\t"
         "ldr	r1, [%[a], #40]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_10_%=\n\t"
@@ -98001,7 +98001,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_10_3_%=: \n\t"
+    "L_sp_384_num_bits_12_10_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98016,7 +98016,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_10_2_%=: \n\t"
+    "L_sp_384_num_bits_12_10_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98031,7 +98031,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_10_1_%=: \n\t"
+    "L_sp_384_num_bits_12_10_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x40\n\t"
@@ -98043,7 +98043,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_10_%=: \n\t"
+    "L_sp_384_num_bits_12_10_%=:\n\t"
         "ldr	r1, [%[a], #36]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_9_%=\n\t"
@@ -98060,7 +98060,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_9_3_%=: \n\t"
+    "L_sp_384_num_bits_12_9_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98075,7 +98075,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_9_2_%=: \n\t"
+    "L_sp_384_num_bits_12_9_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98090,7 +98090,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_9_1_%=: \n\t"
+    "L_sp_384_num_bits_12_9_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x20\n\t"
@@ -98102,7 +98102,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_9_%=: \n\t"
+    "L_sp_384_num_bits_12_9_%=:\n\t"
         "ldr	r1, [%[a], #32]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_8_%=\n\t"
@@ -98119,7 +98119,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_8_3_%=: \n\t"
+    "L_sp_384_num_bits_12_8_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98134,7 +98134,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_8_2_%=: \n\t"
+    "L_sp_384_num_bits_12_8_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98149,14 +98149,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_8_1_%=: \n\t"
+    "L_sp_384_num_bits_12_8_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x100\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_8_%=: \n\t"
+    "L_sp_384_num_bits_12_8_%=:\n\t"
         "ldr	r1, [%[a], #28]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_7_%=\n\t"
@@ -98168,7 +98168,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_7_3_%=: \n\t"
+    "L_sp_384_num_bits_12_7_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98178,7 +98178,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_7_2_%=: \n\t"
+    "L_sp_384_num_bits_12_7_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98188,14 +98188,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_7_1_%=: \n\t"
+    "L_sp_384_num_bits_12_7_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xe0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_7_%=: \n\t"
+    "L_sp_384_num_bits_12_7_%=:\n\t"
         "ldr	r1, [%[a], #24]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_6_%=\n\t"
@@ -98207,7 +98207,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_6_3_%=: \n\t"
+    "L_sp_384_num_bits_12_6_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98217,7 +98217,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_6_2_%=: \n\t"
+    "L_sp_384_num_bits_12_6_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98227,14 +98227,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_6_1_%=: \n\t"
+    "L_sp_384_num_bits_12_6_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xc0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_6_%=: \n\t"
+    "L_sp_384_num_bits_12_6_%=:\n\t"
         "ldr	r1, [%[a], #20]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_5_%=\n\t"
@@ -98246,7 +98246,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_5_3_%=: \n\t"
+    "L_sp_384_num_bits_12_5_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98256,7 +98256,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_5_2_%=: \n\t"
+    "L_sp_384_num_bits_12_5_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98266,14 +98266,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_5_1_%=: \n\t"
+    "L_sp_384_num_bits_12_5_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xa0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_5_%=: \n\t"
+    "L_sp_384_num_bits_12_5_%=:\n\t"
         "ldr	r1, [%[a], #16]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_4_%=\n\t"
@@ -98285,7 +98285,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_4_3_%=: \n\t"
+    "L_sp_384_num_bits_12_4_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98295,7 +98295,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_4_2_%=: \n\t"
+    "L_sp_384_num_bits_12_4_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98305,14 +98305,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_4_1_%=: \n\t"
+    "L_sp_384_num_bits_12_4_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x80\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_4_%=: \n\t"
+    "L_sp_384_num_bits_12_4_%=:\n\t"
         "ldr	r1, [%[a], #12]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_3_%=\n\t"
@@ -98324,7 +98324,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_3_3_%=: \n\t"
+    "L_sp_384_num_bits_12_3_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98334,7 +98334,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_3_2_%=: \n\t"
+    "L_sp_384_num_bits_12_3_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98344,14 +98344,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_3_1_%=: \n\t"
+    "L_sp_384_num_bits_12_3_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x60\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_3_%=: \n\t"
+    "L_sp_384_num_bits_12_3_%=:\n\t"
         "ldr	r1, [%[a], #8]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_2_%=\n\t"
@@ -98363,7 +98363,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_2_3_%=: \n\t"
+    "L_sp_384_num_bits_12_2_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98373,7 +98373,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_2_2_%=: \n\t"
+    "L_sp_384_num_bits_12_2_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98383,14 +98383,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_2_1_%=: \n\t"
+    "L_sp_384_num_bits_12_2_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x40\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_2_%=: \n\t"
+    "L_sp_384_num_bits_12_2_%=:\n\t"
         "ldr	r1, [%[a], #4]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_1_%=\n\t"
@@ -98402,7 +98402,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_1_3_%=: \n\t"
+    "L_sp_384_num_bits_12_1_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98412,7 +98412,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_1_2_%=: \n\t"
+    "L_sp_384_num_bits_12_1_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98422,14 +98422,14 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_1_1_%=: \n\t"
+    "L_sp_384_num_bits_12_1_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #32\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_1_%=: \n\t"
+    "L_sp_384_num_bits_12_1_%=:\n\t"
         "ldr	r1, [%[a]]\n\t"
         "lsr	r3, r1, #24\n\t"
         "cmp	r3, #0\n\t"
@@ -98439,7 +98439,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_0_3_%=: \n\t"
+    "L_sp_384_num_bits_12_0_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98449,7 +98449,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_0_2_%=: \n\t"
+    "L_sp_384_num_bits_12_0_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -98459,11 +98459,11 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_0_1_%=: \n\t"
+    "L_sp_384_num_bits_12_0_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "\n"
-    "L_sp_384_num_bits_12_13_%=: \n\t"
+    "L_sp_384_num_bits_12_13_%=:\n\t"
         "mov	%[a], r12\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a),
@@ -98484,10 +98484,10 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
 WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -98504,7 +98504,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_11_%=: \n\t"
+    "L_sp_384_num_bits_12_11_%=:\n\t"
         "ldr	r1, [%[a], #40]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_10_%=\n\t"
@@ -98518,7 +98518,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_10_%=: \n\t"
+    "L_sp_384_num_bits_12_10_%=:\n\t"
         "ldr	r1, [%[a], #36]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_9_%=\n\t"
@@ -98532,7 +98532,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_9_%=: \n\t"
+    "L_sp_384_num_bits_12_9_%=:\n\t"
         "ldr	r1, [%[a], #32]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_8_%=\n\t"
@@ -98546,7 +98546,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_8_%=: \n\t"
+    "L_sp_384_num_bits_12_8_%=:\n\t"
         "ldr	r1, [%[a], #28]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_7_%=\n\t"
@@ -98555,7 +98555,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_7_%=: \n\t"
+    "L_sp_384_num_bits_12_7_%=:\n\t"
         "ldr	r1, [%[a], #24]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_6_%=\n\t"
@@ -98564,7 +98564,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_6_%=: \n\t"
+    "L_sp_384_num_bits_12_6_%=:\n\t"
         "ldr	r1, [%[a], #20]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_5_%=\n\t"
@@ -98573,7 +98573,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_5_%=: \n\t"
+    "L_sp_384_num_bits_12_5_%=:\n\t"
         "ldr	r1, [%[a], #16]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_4_%=\n\t"
@@ -98582,7 +98582,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_4_%=: \n\t"
+    "L_sp_384_num_bits_12_4_%=:\n\t"
         "ldr	r1, [%[a], #12]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_3_%=\n\t"
@@ -98591,7 +98591,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_3_%=: \n\t"
+    "L_sp_384_num_bits_12_3_%=:\n\t"
         "ldr	r1, [%[a], #8]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_2_%=\n\t"
@@ -98600,7 +98600,7 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_2_%=: \n\t"
+    "L_sp_384_num_bits_12_2_%=:\n\t"
         "ldr	r1, [%[a], #4]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_384_num_bits_12_1_%=\n\t"
@@ -98609,13 +98609,13 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_384_num_bits_12_13_%=\n\t"
         "\n"
-    "L_sp_384_num_bits_12_1_%=: \n\t"
+    "L_sp_384_num_bits_12_1_%=:\n\t"
         "ldr	r1, [%[a]]\n\t"
         "mov	r2, #32\n\t"
         "clz	r12, r1\n\t"
         "sub	r12, r2, r12\n\t"
         "\n"
-    "L_sp_384_num_bits_12_13_%=: \n\t"
+    "L_sp_384_num_bits_12_13_%=:\n\t"
         "mov	%[a], r12\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
@@ -99618,12 +99618,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -99636,13 +99636,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_521_mul_17_outer_%=: \n\t"
+    "L_sp_521_mul_17_outer_%=:\n\t"
         "subs	r3, r5, #0x40\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_521_mul_17_inner_%=: \n\t"
+    "L_sp_521_mul_17_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -99760,7 +99760,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_521_mul_17_inner_done_%=: \n\t"
+    "L_sp_521_mul_17_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -99805,7 +99805,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
         "stm	%[r]!, {r6, r7}\n\t"
         "sub	r5, r5, #8\n\t"
         "\n"
-    "L_sp_521_mul_17_store_%=: \n\t"
+    "L_sp_521_mul_17_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -99835,12 +99835,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -110864,11 +110864,11 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -110880,13 +110880,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_521_sqr_17_outer_%=: \n\t"
+    "L_sp_521_sqr_17_outer_%=:\n\t"
         "subs	r3, r5, #0x40\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_521_sqr_17_inner_%=: \n\t"
+    "L_sp_521_sqr_17_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -110971,7 +110971,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_521_sqr_17_inner_done_%=: \n\t"
+    "L_sp_521_sqr_17_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -111010,7 +111010,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
         "stm	%[r]!, {r6, r7}\n\t"
         "sub	r5, r5, #8\n\t"
         "\n"
-    "L_sp_521_sqr_17_store_%=: \n\t"
+    "L_sp_521_sqr_17_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -111038,11 +111038,11 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -116926,19 +116926,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_add_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_add_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0x40\n\t"
         "\n"
-    "L_sp_521_add_17_word_%=: \n\t"
+    "L_sp_521_add_17_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -116984,12 +116984,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_add_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_add_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -117268,13 +117268,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -117282,7 +117282,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_521_cond_sub_17_words_%=: \n\t"
+    "L_sp_521_cond_sub_17_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -117321,13 +117321,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -117419,10 +117419,10 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_17(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a,
     const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -117569,12 +117569,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -117587,7 +117587,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_521_mont_reduce_order_17_word_%=: \n\t"
+    "L_sp_521_mont_reduce_order_17_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         "cmp	r9, #0x40\n\t"
@@ -117600,7 +117600,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
 #endif
         "and	r8, r8, r7\n\t"
         "\n"
-    "L_sp_521_mont_reduce_order_17_nomask_%=: \n\t"
+    "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t"
         /* a[i+0] += m[0] * mu */
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
         "ldr	r11, [%[m]]\n\t"
@@ -118210,12 +118210,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -118226,7 +118226,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_521_mont_reduce_order_17_word_%=: \n\t"
+    "L_sp_521_mont_reduce_order_17_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         "cmp	r9, #0x40\n\t"
@@ -118239,7 +118239,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
 #endif
         "and	r8, r8, r7\n\t"
         "\n"
-    "L_sp_521_mont_reduce_order_17_nomask_%=: \n\t"
+    "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t"
         /* a[i+0] += m[0] * mu */
         "mov	r5, #0\n\t"
         "umlal	r12, r5, r8, r11\n\t"
@@ -118481,12 +118481,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -118499,7 +118499,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_521_mont_reduce_order_17_word_%=: \n\t"
+    "L_sp_521_mont_reduce_order_17_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         "cmp	r12, #0x40\n\t"
@@ -118512,7 +118512,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
 #endif
         "and	r11, r11, r10\n\t"
         "\n"
-    "L_sp_521_mont_reduce_order_17_nomask_%=: \n\t"
+    "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t"
         /* a[i+0] += m[0] * mu */
         "ldr	r10, [%[m]]\n\t"
         "mov	r3, #0\n\t"
@@ -118847,11 +118847,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_521_cmp_17(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_521_cmp_17(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -118862,7 +118862,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_521_cmp_17(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #0x40\n\t"
         "\n"
-    "L_sp_521_cmp_17_words_%=: \n\t"
+    "L_sp_521_cmp_17_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -119139,12 +119139,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_mont_add_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mont_add_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -119242,11 +119242,11 @@ WC_OMIT_FRAME_POINTER static void sp_521_mont_dbl_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mont_dbl_17(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -119335,11 +119335,11 @@ WC_OMIT_FRAME_POINTER static void sp_521_mont_tpl_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mont_tpl_17(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -119463,12 +119463,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_mont_sub_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mont_sub_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -119561,11 +119561,11 @@ WC_OMIT_FRAME_POINTER static void sp_521_rshift1_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_rshift1_17(sp_digit* r,
     const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -123658,10 +123658,10 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am,
 WC_OMIT_FRAME_POINTER static void sp_521_add_one_17(sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_add_one_17(sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -124051,12 +124051,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_rshift_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_rshift_17(sp_digit* r,
     const sp_digit* a, byte n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register byte n asm ("r2") = (byte)n_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register byte n __asm__ ("r2") = (byte)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -124172,12 +124172,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_lshift_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_lshift_17(sp_digit* r,
     const sp_digit* a, byte n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register byte n asm ("r2") = (byte)n_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register byte n __asm__ ("r2") = (byte)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -124301,12 +124301,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_lshift_34(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_lshift_34(sp_digit* r,
     const sp_digit* a, byte n)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register byte n asm ("r2") = (byte)n_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register byte n __asm__ ("r2") = (byte)n_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -124538,18 +124538,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_in_place_17(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_in_place_17(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x40\n\t"
         "\n"
-    "L_sp_521_sub_in_place_17_word_%=: \n\t"
+    "L_sp_521_sub_in_place_17_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -124592,11 +124592,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_in_place_17(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_in_place_17(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -124659,12 +124659,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -124700,7 +124700,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_521_mul_d_17_word_%=: \n\t"
+    "L_sp_521_mul_d_17_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -124770,12 +124770,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -125348,12 +125348,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_521_word_17(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_521_word_17(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -125420,12 +125420,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_521_word_17(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_521_word_17(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -125443,7 +125443,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_521_word_17(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_521_word_17_bit_%=: \n\t"
+    "L_div_521_word_17_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -126182,19 +126182,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x40\n\t"
         "\n"
-    "L_sp_521_sub_17_word_%=: \n\t"
+    "L_sp_521_sub_17_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
         "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
@@ -126238,12 +126238,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -126305,12 +126305,12 @@ WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -126353,7 +126353,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r,
         "adc	r3, r12, r12\n\t"
         "b	L_sp_521_div2_mod_17_div2_%=\n\t"
         "\n"
-    "L_sp_521_div2_mod_17_even_%=: \n\t"
+    "L_sp_521_div2_mod_17_even_%=:\n\t"
         "ldm	%[a]!, {r5, r6, r7}\n\t"
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
@@ -126365,7 +126365,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r,
         "ldm	%[a]!, {r4}\n\t"
         "stm	%[r]!, {r4}\n\t"
         "\n"
-    "L_sp_521_div2_mod_17_div2_%=: \n\t"
+    "L_sp_521_div2_mod_17_div2_%=:\n\t"
         "sub	%[r], %[r], #0x44\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "ldm	r0, {r8, r9}\n\t"
@@ -126451,7 +126451,7 @@ WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r,
 }
 
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
-static const byte L_sp_521_num_bits_17_table[] = {
+XALIGNED(4) static const word8 L_sp_521_num_bits_17_table[] = {
     0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
     0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
@@ -126490,15 +126490,15 @@ static const byte L_sp_521_num_bits_17_table[] = {
 WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register byte* L_sp_521_num_bits_17_table_c asm ("r1") =
-        (byte*)&L_sp_521_num_bits_17_table;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register word8* L_sp_521_num_bits_17_table_c __asm__ ("r1") =
+        (word8*)&L_sp_521_num_bits_17_table;
 #else
-    register byte* L_sp_521_num_bits_17_table_c =
-        (byte*)&L_sp_521_num_bits_17_table;
+    register word8* L_sp_521_num_bits_17_table_c =
+        (word8*)&L_sp_521_num_bits_17_table;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -126519,7 +126519,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_16_3_%=: \n\t"
+    "L_sp_521_num_bits_17_16_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126534,7 +126534,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_16_2_%=: \n\t"
+    "L_sp_521_num_bits_17_16_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126549,14 +126549,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_16_1_%=: \n\t"
+    "L_sp_521_num_bits_17_16_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x200\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_16_%=: \n\t"
+    "L_sp_521_num_bits_17_16_%=:\n\t"
         "ldr	r1, [%[a], #60]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_15_%=\n\t"
@@ -126573,7 +126573,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_15_3_%=: \n\t"
+    "L_sp_521_num_bits_17_15_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126588,7 +126588,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_15_2_%=: \n\t"
+    "L_sp_521_num_bits_17_15_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126603,7 +126603,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_15_1_%=: \n\t"
+    "L_sp_521_num_bits_17_15_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0xe0\n\t"
@@ -126615,7 +126615,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_15_%=: \n\t"
+    "L_sp_521_num_bits_17_15_%=:\n\t"
         "ldr	r1, [%[a], #56]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_14_%=\n\t"
@@ -126632,7 +126632,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_14_3_%=: \n\t"
+    "L_sp_521_num_bits_17_14_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126647,7 +126647,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_14_2_%=: \n\t"
+    "L_sp_521_num_bits_17_14_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126662,7 +126662,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_14_1_%=: \n\t"
+    "L_sp_521_num_bits_17_14_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0xc0\n\t"
@@ -126674,7 +126674,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_14_%=: \n\t"
+    "L_sp_521_num_bits_17_14_%=:\n\t"
         "ldr	r1, [%[a], #52]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_13_%=\n\t"
@@ -126691,7 +126691,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_13_3_%=: \n\t"
+    "L_sp_521_num_bits_17_13_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126706,7 +126706,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_13_2_%=: \n\t"
+    "L_sp_521_num_bits_17_13_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126721,7 +126721,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_13_1_%=: \n\t"
+    "L_sp_521_num_bits_17_13_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0xa0\n\t"
@@ -126733,7 +126733,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_13_%=: \n\t"
+    "L_sp_521_num_bits_17_13_%=:\n\t"
         "ldr	r1, [%[a], #48]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_12_%=\n\t"
@@ -126750,7 +126750,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_12_3_%=: \n\t"
+    "L_sp_521_num_bits_17_12_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126765,7 +126765,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_12_2_%=: \n\t"
+    "L_sp_521_num_bits_17_12_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126780,7 +126780,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_12_1_%=: \n\t"
+    "L_sp_521_num_bits_17_12_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x80\n\t"
@@ -126792,7 +126792,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_12_%=: \n\t"
+    "L_sp_521_num_bits_17_12_%=:\n\t"
         "ldr	r1, [%[a], #44]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_11_%=\n\t"
@@ -126809,7 +126809,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_11_3_%=: \n\t"
+    "L_sp_521_num_bits_17_11_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126824,7 +126824,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_11_2_%=: \n\t"
+    "L_sp_521_num_bits_17_11_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126839,7 +126839,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_11_1_%=: \n\t"
+    "L_sp_521_num_bits_17_11_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x60\n\t"
@@ -126851,7 +126851,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_11_%=: \n\t"
+    "L_sp_521_num_bits_17_11_%=:\n\t"
         "ldr	r1, [%[a], #40]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_10_%=\n\t"
@@ -126868,7 +126868,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_10_3_%=: \n\t"
+    "L_sp_521_num_bits_17_10_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126883,7 +126883,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_10_2_%=: \n\t"
+    "L_sp_521_num_bits_17_10_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126898,7 +126898,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_10_1_%=: \n\t"
+    "L_sp_521_num_bits_17_10_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x40\n\t"
@@ -126910,7 +126910,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_10_%=: \n\t"
+    "L_sp_521_num_bits_17_10_%=:\n\t"
         "ldr	r1, [%[a], #36]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_9_%=\n\t"
@@ -126927,7 +126927,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_9_3_%=: \n\t"
+    "L_sp_521_num_bits_17_9_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126942,7 +126942,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_9_2_%=: \n\t"
+    "L_sp_521_num_bits_17_9_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -126957,7 +126957,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_9_1_%=: \n\t"
+    "L_sp_521_num_bits_17_9_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
         "mov	r2, #0x20\n\t"
@@ -126969,7 +126969,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_9_%=: \n\t"
+    "L_sp_521_num_bits_17_9_%=:\n\t"
         "ldr	r1, [%[a], #32]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_8_%=\n\t"
@@ -126986,7 +126986,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_8_3_%=: \n\t"
+    "L_sp_521_num_bits_17_8_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127001,7 +127001,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_8_2_%=: \n\t"
+    "L_sp_521_num_bits_17_8_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127016,14 +127016,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_8_1_%=: \n\t"
+    "L_sp_521_num_bits_17_8_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x100\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_8_%=: \n\t"
+    "L_sp_521_num_bits_17_8_%=:\n\t"
         "ldr	r1, [%[a], #28]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_7_%=\n\t"
@@ -127035,7 +127035,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_7_3_%=: \n\t"
+    "L_sp_521_num_bits_17_7_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127045,7 +127045,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_7_2_%=: \n\t"
+    "L_sp_521_num_bits_17_7_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127055,14 +127055,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_7_1_%=: \n\t"
+    "L_sp_521_num_bits_17_7_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xe0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_7_%=: \n\t"
+    "L_sp_521_num_bits_17_7_%=:\n\t"
         "ldr	r1, [%[a], #24]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_6_%=\n\t"
@@ -127074,7 +127074,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_6_3_%=: \n\t"
+    "L_sp_521_num_bits_17_6_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127084,7 +127084,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_6_2_%=: \n\t"
+    "L_sp_521_num_bits_17_6_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127094,14 +127094,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_6_1_%=: \n\t"
+    "L_sp_521_num_bits_17_6_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xc0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_6_%=: \n\t"
+    "L_sp_521_num_bits_17_6_%=:\n\t"
         "ldr	r1, [%[a], #20]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_5_%=\n\t"
@@ -127113,7 +127113,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_5_3_%=: \n\t"
+    "L_sp_521_num_bits_17_5_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127123,7 +127123,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_5_2_%=: \n\t"
+    "L_sp_521_num_bits_17_5_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127133,14 +127133,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_5_1_%=: \n\t"
+    "L_sp_521_num_bits_17_5_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0xa0\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_5_%=: \n\t"
+    "L_sp_521_num_bits_17_5_%=:\n\t"
         "ldr	r1, [%[a], #16]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_4_%=\n\t"
@@ -127152,7 +127152,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_4_3_%=: \n\t"
+    "L_sp_521_num_bits_17_4_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127162,7 +127162,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_4_2_%=: \n\t"
+    "L_sp_521_num_bits_17_4_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127172,14 +127172,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_4_1_%=: \n\t"
+    "L_sp_521_num_bits_17_4_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x80\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_4_%=: \n\t"
+    "L_sp_521_num_bits_17_4_%=:\n\t"
         "ldr	r1, [%[a], #12]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_3_%=\n\t"
@@ -127191,7 +127191,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_3_3_%=: \n\t"
+    "L_sp_521_num_bits_17_3_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127201,7 +127201,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_3_2_%=: \n\t"
+    "L_sp_521_num_bits_17_3_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127211,14 +127211,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_3_1_%=: \n\t"
+    "L_sp_521_num_bits_17_3_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x60\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_3_%=: \n\t"
+    "L_sp_521_num_bits_17_3_%=:\n\t"
         "ldr	r1, [%[a], #8]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_2_%=\n\t"
@@ -127230,7 +127230,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_2_3_%=: \n\t"
+    "L_sp_521_num_bits_17_2_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127240,7 +127240,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_2_2_%=: \n\t"
+    "L_sp_521_num_bits_17_2_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127250,14 +127250,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_2_1_%=: \n\t"
+    "L_sp_521_num_bits_17_2_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #0x40\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_2_%=: \n\t"
+    "L_sp_521_num_bits_17_2_%=:\n\t"
         "ldr	r1, [%[a], #4]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_1_%=\n\t"
@@ -127269,7 +127269,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_1_3_%=: \n\t"
+    "L_sp_521_num_bits_17_1_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127279,7 +127279,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_1_2_%=: \n\t"
+    "L_sp_521_num_bits_17_1_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127289,14 +127289,14 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_1_1_%=: \n\t"
+    "L_sp_521_num_bits_17_1_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "mov	r2, #32\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_1_%=: \n\t"
+    "L_sp_521_num_bits_17_1_%=:\n\t"
         "ldr	r1, [%[a]]\n\t"
         "lsr	r3, r1, #24\n\t"
         "cmp	r3, #0\n\t"
@@ -127306,7 +127306,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_0_3_%=: \n\t"
+    "L_sp_521_num_bits_17_0_3_%=:\n\t"
         "lsr	r3, r1, #16\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127316,7 +127316,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_0_2_%=: \n\t"
+    "L_sp_521_num_bits_17_0_2_%=:\n\t"
         "lsr	r3, r1, #8\n\t"
         "and	r3, r3, #0xff\n\t"
         "cmp	r3, #0\n\t"
@@ -127326,11 +127326,11 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "add	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_0_1_%=: \n\t"
+    "L_sp_521_num_bits_17_0_1_%=:\n\t"
         "and	r3, r1, #0xff\n\t"
         "ldrb	r12, [lr, r3]\n\t"
         "\n"
-    "L_sp_521_num_bits_17_18_%=: \n\t"
+    "L_sp_521_num_bits_17_18_%=:\n\t"
         "mov	%[a], r12\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a),
@@ -127351,10 +127351,10 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
 WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -127371,7 +127371,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_16_%=: \n\t"
+    "L_sp_521_num_bits_17_16_%=:\n\t"
         "ldr	r1, [%[a], #60]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_15_%=\n\t"
@@ -127380,7 +127380,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_15_%=: \n\t"
+    "L_sp_521_num_bits_17_15_%=:\n\t"
         "ldr	r1, [%[a], #56]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_14_%=\n\t"
@@ -127394,7 +127394,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_14_%=: \n\t"
+    "L_sp_521_num_bits_17_14_%=:\n\t"
         "ldr	r1, [%[a], #52]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_13_%=\n\t"
@@ -127408,7 +127408,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_13_%=: \n\t"
+    "L_sp_521_num_bits_17_13_%=:\n\t"
         "ldr	r1, [%[a], #48]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_12_%=\n\t"
@@ -127422,7 +127422,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_12_%=: \n\t"
+    "L_sp_521_num_bits_17_12_%=:\n\t"
         "ldr	r1, [%[a], #44]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_11_%=\n\t"
@@ -127436,7 +127436,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_11_%=: \n\t"
+    "L_sp_521_num_bits_17_11_%=:\n\t"
         "ldr	r1, [%[a], #40]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_10_%=\n\t"
@@ -127450,7 +127450,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_10_%=: \n\t"
+    "L_sp_521_num_bits_17_10_%=:\n\t"
         "ldr	r1, [%[a], #36]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_9_%=\n\t"
@@ -127464,7 +127464,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_9_%=: \n\t"
+    "L_sp_521_num_bits_17_9_%=:\n\t"
         "ldr	r1, [%[a], #32]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_8_%=\n\t"
@@ -127478,7 +127478,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_8_%=: \n\t"
+    "L_sp_521_num_bits_17_8_%=:\n\t"
         "ldr	r1, [%[a], #28]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_7_%=\n\t"
@@ -127487,7 +127487,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_7_%=: \n\t"
+    "L_sp_521_num_bits_17_7_%=:\n\t"
         "ldr	r1, [%[a], #24]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_6_%=\n\t"
@@ -127496,7 +127496,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_6_%=: \n\t"
+    "L_sp_521_num_bits_17_6_%=:\n\t"
         "ldr	r1, [%[a], #20]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_5_%=\n\t"
@@ -127505,7 +127505,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_5_%=: \n\t"
+    "L_sp_521_num_bits_17_5_%=:\n\t"
         "ldr	r1, [%[a], #16]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_4_%=\n\t"
@@ -127514,7 +127514,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_4_%=: \n\t"
+    "L_sp_521_num_bits_17_4_%=:\n\t"
         "ldr	r1, [%[a], #12]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_3_%=\n\t"
@@ -127523,7 +127523,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_3_%=: \n\t"
+    "L_sp_521_num_bits_17_3_%=:\n\t"
         "ldr	r1, [%[a], #8]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_2_%=\n\t"
@@ -127532,7 +127532,7 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_2_%=: \n\t"
+    "L_sp_521_num_bits_17_2_%=:\n\t"
         "ldr	r1, [%[a], #4]\n\t"
         "cmp	r1, #0\n\t"
         "beq	L_sp_521_num_bits_17_1_%=\n\t"
@@ -127541,13 +127541,13 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
         "sub	r12, r2, r12\n\t"
         "b	L_sp_521_num_bits_17_18_%=\n\t"
         "\n"
-    "L_sp_521_num_bits_17_1_%=: \n\t"
+    "L_sp_521_num_bits_17_1_%=:\n\t"
         "ldr	r1, [%[a]]\n\t"
         "mov	r2, #32\n\t"
         "clz	r12, r1\n\t"
         "sub	r12, r2, r12\n\t"
         "\n"
-    "L_sp_521_num_bits_17_18_%=: \n\t"
+    "L_sp_521_num_bits_17_18_%=:\n\t"
         "mov	%[a], r12\n\t"
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
@@ -128430,12 +128430,12 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_16(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -138200,11 +138200,11 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_16(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143447,12 +143447,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_16(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_16(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143509,11 +143509,11 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143598,12 +143598,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143758,12 +143758,12 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_16(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_16(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143857,12 +143857,12 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -143875,13 +143875,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_1024_mul_32_outer_%=: \n\t"
+    "L_sp_1024_mul_32_outer_%=:\n\t"
         "subs	r3, r5, #0x7c\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_1024_mul_32_inner_%=: \n\t"
+    "L_sp_1024_mul_32_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[b], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -143999,7 +143999,7 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_1024_mul_32_inner_done_%=: \n\t"
+    "L_sp_1024_mul_32_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -144041,7 +144041,7 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_1024_mul_32_store_%=: \n\t"
+    "L_sp_1024_mul_32_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -144068,11 +144068,11 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_32(sp_digit* r_p,
     const sp_digit* a_p)
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -144084,13 +144084,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
         "mov	r8, #0\n\t"
         "mov	r5, #4\n\t"
         "\n"
-    "L_sp_1024_sqr_32_outer_%=: \n\t"
+    "L_sp_1024_sqr_32_outer_%=:\n\t"
         "subs	r3, r5, #0x7c\n\t"
         "it	cc\n\t"
         "movcc	r3, #0\n\t"
         "sub	r4, r5, r3\n\t"
         "\n"
-    "L_sp_1024_sqr_32_inner_%=: \n\t"
+    "L_sp_1024_sqr_32_inner_%=:\n\t"
         "ldr	lr, [%[a], r3]\n\t"
         "ldr	r11, [%[a], r4]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -144175,7 +144175,7 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
         "adc	r8, r8, #0\n\t"
 #endif
         "\n"
-    "L_sp_1024_sqr_32_inner_done_%=: \n\t"
+    "L_sp_1024_sqr_32_inner_done_%=:\n\t"
         "str	r6, [sp, r5]\n\t"
         "mov	r6, r7\n\t"
         "mov	r7, r8\n\t"
@@ -144211,7 +144211,7 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
         "add	r5, r5, #4\n\t"
         "str	r7, [sp, r5]\n\t"
         "\n"
-    "L_sp_1024_sqr_32_store_%=: \n\t"
+    "L_sp_1024_sqr_32_store_%=:\n\t"
         "ldm	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "stm	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
         "subs	r5, r5, #32\n\t"
@@ -144326,18 +144326,18 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r12, #0\n\t"
         "add	lr, %[a], #0x80\n\t"
         "\n"
-    "L_sp_1024_sub_in_place_32_word_%=: \n\t"
+    "L_sp_1024_sub_in_place_32_word_%=:\n\t"
         "rsbs	r12, r12, #0\n\t"
         "ldm	%[a], {r2, r3, r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
@@ -144379,13 +144379,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -144393,7 +144393,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r,
         "mov	r12, #0\n\t"
         "mov	lr, #0\n\t"
         "\n"
-    "L_sp_1024_cond_sub_32_words_%=: \n\t"
+    "L_sp_1024_cond_sub_32_words_%=:\n\t"
         "subs	r12, r6, r12\n\t"
         "ldr	r4, [%[a], lr]\n\t"
         "ldr	r5, [%[b], lr]\n\t"
@@ -144432,13 +144432,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -144582,19 +144582,19 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
         "mov	r3, #0\n\t"
         "add	r12, %[a], #0x80\n\t"
         "\n"
-    "L_sp_1024_add_32_word_%=: \n\t"
+    "L_sp_1024_add_32_word_%=:\n\t"
         "adds	r3, r3, #-1\n\t"
         "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
         "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
@@ -144635,12 +144635,12 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -144676,7 +144676,7 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r,
         "mov	r5, #0\n\t"
         "mov	r9, #4\n\t"
         "\n"
-    "L_sp_1024_mul_d_32_word_%=: \n\t"
+    "L_sp_1024_mul_d_32_word_%=:\n\t"
         /* A[i] * B */
         "ldr	r8, [%[a], r9]\n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)
@@ -144746,12 +144746,12 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r,
     const sp_digit* a, sp_digit b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register sp_digit b asm ("r2") = (sp_digit)b_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -145804,12 +145804,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_1024_word_32(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -145876,12 +145876,12 @@ WC_OMIT_FRAME_POINTER static sp_digit div_1024_word_32(sp_digit d1_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0,
     sp_digit div)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit d1 asm ("r0") = (sp_digit)d1_p;
-    register sp_digit d0 asm ("r1") = (sp_digit)d0_p;
-    register sp_digit div asm ("r2") = (sp_digit)div_p;
+    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
+    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
+    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -145899,7 +145899,7 @@ WC_OMIT_FRAME_POINTER static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0,
         /* Next 30 bits */
         "mov	r12, #29\n\t"
         "\n"
-    "L_div_1024_word_32_bit_%=: \n\t"
+    "L_div_1024_word_32_bit_%=:\n\t"
         "lsls	r4, r4, #1\n\t"
         "adc	r5, r5, r5\n\t"
         "subs	r6, lr, r5\n\t"
@@ -146055,11 +146055,11 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_1024_cmp_32(const sp_digit* a_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_int32 sp_1024_cmp_32(const sp_digit* a,
     const sp_digit* b)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r1") = (const sp_digit*)b_p;
+    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -146070,7 +146070,7 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_1024_cmp_32(const sp_digit* a,
 #ifdef WOLFSSL_SP_SMALL
         "mov	r4, #0x7c\n\t"
         "\n"
-    "L_sp_1024_cmp_32_words_%=: \n\t"
+    "L_sp_1024_cmp_32_words_%=:\n\t"
         "ldr	r12, [%[a], r4]\n\t"
         "ldr	lr, [%[b], r4]\n\t"
         "and	r12, r12, r3\n\t"
@@ -146795,12 +146795,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -146813,7 +146813,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_1024_mont_reduce_32_word_%=: \n\t"
+    "L_sp_1024_mont_reduce_32_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -147794,12 +147794,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -147810,7 +147810,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
         "ldr	r12, [%[a]]\n\t"
         "ldr	lr, [%[a], #4]\n\t"
         "\n"
-    "L_sp_1024_mont_reduce_32_word_%=: \n\t"
+    "L_sp_1024_mont_reduce_32_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r8, %[mp], r12\n\t"
         /* a[i+0] += m[0] * mu */
@@ -148108,12 +148108,12 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
 #else
 WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* a asm ("r0") = (sp_digit*)a_p;
-    register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
-    register sp_digit mp asm ("r2") = (sp_digit)mp_p;
+    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
+    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -148126,7 +148126,7 @@ WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
         "ldr	r7, [%[a], #12]\n\t"
         "ldr	r8, [%[a], #16]\n\t"
         "\n"
-    "L_sp_1024_mont_reduce_32_word_%=: \n\t"
+    "L_sp_1024_mont_reduce_32_word_%=:\n\t"
         /* mu = a[i] * mp */
         "mul	r11, %[mp], r4\n\t"
         /* a[i+0] += m[0] * mu */
@@ -148470,13 +148470,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mont_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mont_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register const sp_digit* m asm ("r3") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register const sp_digit* m __asm__ ("r3") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -148656,12 +148656,12 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mont_dbl_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mont_dbl_32(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -148825,12 +148825,12 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mont_tpl_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mont_tpl_32(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* m __asm__ ("r2") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -149150,13 +149150,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mont_sub_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_mont_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register const sp_digit* m asm ("r3") = (const sp_digit*)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register const sp_digit* m __asm__ ("r3") = (const sp_digit*)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -149333,13 +149333,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -149347,7 +149347,7 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r,
         "mov	r6, #0\n\t"
         "mov	r12, #0\n\t"
         "\n"
-    "L_sp_1024_cond_add_32_words_%=: \n\t"
+    "L_sp_1024_cond_add_32_words_%=:\n\t"
         "adds	lr, lr, #-1\n\t"
         "ldr	r4, [%[a], r12]\n\t"
         "ldr	r5, [%[b], r12]\n\t"
@@ -149386,13 +149386,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, sp_digit m)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-    register sp_digit m asm ("r3") = (sp_digit)m_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
+    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
@@ -149529,11 +149529,11 @@ WC_OMIT_FRAME_POINTER static void sp_1024_rshift1_32(sp_digit* r_p,
 #else
 WC_OMIT_FRAME_POINTER static void sp_1024_rshift1_32(sp_digit* r,
     const sp_digit* a)
-#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 
     __asm__ __volatile__ (
diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c
index 88604d693c8..b479c93b9bd 100644
--- a/wolfcrypt/src/sp_armthumb.c
+++ b/wolfcrypt/src/sp_armthumb.c
@@ -302,9 +302,10 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[8 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -534,8 +535,10 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -547,9 +550,10 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "sub	sp, sp, #32\n\t"
         "mov	r8, %[r]\n\t"
@@ -9490,7 +9494,7 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
 }
 
@@ -9501,9 +9505,10 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -9581,7 +9586,7 @@ SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -9592,9 +9597,10 @@ SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_word_8(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_word_8(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -9669,7 +9675,7 @@ SP_NOINLINE static sp_digit sp_2048_add_word_8(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -9679,9 +9685,10 @@ SP_NOINLINE static sp_digit sp_2048_add_word_8(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_in_place_16(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -9834,7 +9841,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -9845,9 +9852,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_16(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -9993,7 +10001,7 @@ SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -10067,9 +10075,10 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_word_16(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_word_16(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -10208,7 +10217,7 @@ SP_NOINLINE static sp_digit sp_2048_add_word_16(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -10218,9 +10227,10 @@ SP_NOINLINE static sp_digit sp_2048_add_word_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_in_place_32(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -10517,7 +10527,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -10528,9 +10538,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -10812,7 +10823,7 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -10890,9 +10901,10 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_word_32(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_word_32(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -11159,7 +11171,7 @@ SP_NOINLINE static sp_digit sp_2048_add_word_32(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -11169,9 +11181,10 @@ SP_NOINLINE static sp_digit sp_2048_add_word_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_in_place_64(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -11756,7 +11769,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -11767,9 +11780,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_64(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -12323,7 +12337,7 @@ SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -12401,8 +12415,10 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_8(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -12815,7 +12831,8 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -12825,8 +12842,10 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_8(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "sub	sp, sp, #32\n\t"
         "mov	r8, %[r]\n\t"
@@ -19330,7 +19349,8 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
         "stm	%[r]!, {r2, r3, r4, r5}\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -19341,9 +19361,10 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -19420,7 +19441,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -19466,9 +19487,10 @@ SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_16(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -19613,7 +19635,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -19659,9 +19681,10 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -19942,7 +19965,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -19990,9 +20013,10 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_64(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -20058,7 +20082,7 @@ SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -20070,9 +20094,10 @@ SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_in_place_64(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -20137,7 +20162,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -20150,9 +20175,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_64(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[64 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -20392,8 +20418,10 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -20403,8 +20431,10 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_64(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -20842,7 +20872,8 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -20872,9 +20903,10 @@ static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_add_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -20934,7 +20966,7 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -20946,9 +20978,10 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_in_place_32(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -21007,7 +21040,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -21020,9 +21053,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[32 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -21252,8 +21286,10 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -21263,8 +21299,10 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_32(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -21687,7 +21725,8 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -21720,9 +21759,10 @@ static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho)
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_d_64(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #0xff\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -21901,7 +21941,7 @@ SP_NOINLINE static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -21928,9 +21968,10 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_cond_sub_32(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0x80\n\t"
@@ -21978,7 +22019,7 @@ SP_NOINLINE static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -21990,9 +22031,10 @@ SP_NOINLINE static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -23046,7 +23088,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -23086,9 +23129,10 @@ SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_d_32(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #0x80\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -23262,7 +23306,7 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -23275,9 +23319,10 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_32(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -23877,7 +23922,7 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -23889,8 +23934,10 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_2048_cmp_32(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -23986,7 +24033,7 @@ SP_NOINLINE static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -24367,9 +24414,10 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_cond_sub_64(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0xff\n\t"
@@ -24422,7 +24470,7 @@ SP_NOINLINE static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -24434,9 +24482,10 @@ SP_NOINLINE static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -26044,7 +26093,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -26085,9 +26135,10 @@ SP_NOINLINE static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_64(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r3, #0\n\t"
@@ -26147,7 +26198,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -26159,9 +26210,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_sub_64(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -26714,7 +26766,7 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -26729,9 +26781,10 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_64(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -27331,7 +27384,7 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -27447,8 +27500,10 @@ static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_2048_cmp_64(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -27544,7 +27599,7 @@ SP_NOINLINE static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -28033,9 +28088,10 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_2048_cond_add_32(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0x80\n\t"
@@ -28089,7 +28145,7 @@ SP_NOINLINE static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -28373,8 +28429,10 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod,
  * a  A single precision integer.
  * n  Integer representing number of bits to shift.
  */
-static void sp_2048_lshift_64(sp_digit* r, const sp_digit* a, byte n)
+WC_OMIT_FRAME_POINTER static void sp_2048_lshift_64(sp_digit* r,
+    const sp_digit* a, byte n)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #31\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -29954,7 +30012,7 @@ static void sp_2048_lshift_64(sp_digit* r, const sp_digit* a, byte n)
         "str	r5, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -30369,9 +30427,10 @@ static void sp_3072_to_bin_96(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mul_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[12 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -30601,8 +30660,10 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -30614,9 +30675,10 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mul_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "sub	sp, sp, #48\n\t"
         "mov	r8, %[r]\n\t"
@@ -50783,7 +50845,7 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
 }
 
@@ -50794,9 +50856,10 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -50908,7 +50971,7 @@ SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -50919,9 +50982,10 @@ SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_word_12(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_word_12(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -51028,7 +51092,7 @@ SP_NOINLINE static sp_digit sp_3072_add_word_12(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -51038,9 +51102,10 @@ SP_NOINLINE static sp_digit sp_3072_add_word_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_in_place_24(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -51265,7 +51330,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -51276,9 +51341,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_24(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -51492,7 +51558,7 @@ SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -51570,9 +51636,10 @@ SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_word_24(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_word_24(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -51775,7 +51842,7 @@ SP_NOINLINE static sp_digit sp_3072_add_word_24(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -51785,9 +51852,10 @@ SP_NOINLINE static sp_digit sp_3072_add_word_24(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_in_place_48(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -52228,7 +52296,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -52239,9 +52307,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_48(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -52659,7 +52728,7 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -52737,9 +52806,10 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_word_48(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_word_48(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -53134,7 +53204,7 @@ SP_NOINLINE static sp_digit sp_3072_add_word_48(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -53144,9 +53214,10 @@ SP_NOINLINE static sp_digit sp_3072_add_word_48(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_in_place_96(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -54019,7 +54090,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -54030,9 +54101,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_96(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -54858,7 +54930,7 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -54936,8 +55008,10 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_sqr_12(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -55350,7 +55424,8 @@ SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -55360,8 +55435,10 @@ SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_sqr_12(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "sub	sp, sp, #48\n\t"
         "mov	r8, %[r]\n\t"
@@ -70121,7 +70198,8 @@ SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
         "stm	%[r]!, {r2, r3, r4, r5}\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -70132,9 +70210,10 @@ SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -70245,7 +70324,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -70291,9 +70370,10 @@ SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_24(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -70506,7 +70586,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -70552,9 +70632,10 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_48(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -70971,7 +71052,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -71019,9 +71100,10 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_96(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -71087,7 +71169,7 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -71099,9 +71181,10 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_in_place_96(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -71166,7 +71249,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -71179,9 +71262,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mul_96(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[96 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -71431,8 +71515,10 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -71442,8 +71528,10 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_sqr_96(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -71896,7 +71984,8 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -71926,9 +72015,10 @@ static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_add_48(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -71988,7 +72078,7 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -72000,9 +72090,10 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_in_place_48(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -72061,7 +72152,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -72074,9 +72165,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mul_48(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[48 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -72311,8 +72403,10 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -72322,8 +72416,10 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_sqr_48(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -72756,7 +72852,8 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -72789,9 +72886,10 @@ static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho)
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mul_d_96(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #0xff\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -72970,7 +73068,7 @@ SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -72997,9 +73095,10 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_cond_sub_48(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0xc0\n\t"
@@ -73047,7 +73146,7 @@ SP_NOINLINE static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -73059,9 +73158,10 @@ SP_NOINLINE static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -74387,7 +74487,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -74427,9 +74528,10 @@ SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mul_d_48(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #0xc0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -74603,7 +74705,7 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -74616,9 +74718,10 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_48(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -75218,7 +75321,7 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -75230,8 +75333,10 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_3072_cmp_48(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -75327,7 +75432,7 @@ SP_NOINLINE static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -75708,9 +75813,10 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_cond_sub_96(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0xff\n\t"
@@ -75763,7 +75869,7 @@ SP_NOINLINE static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -75775,9 +75881,10 @@ SP_NOINLINE static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -77939,7 +78046,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -77980,9 +78088,10 @@ SP_NOINLINE static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_96(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r3, #0\n\t"
@@ -78042,7 +78151,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -78054,9 +78163,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_sub_96(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -78881,7 +78991,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -78896,9 +79006,10 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_96(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -79498,7 +79609,7 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -79614,8 +79725,10 @@ static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_3072_cmp_96(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -79716,7 +79829,7 @@ SP_NOINLINE static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -80205,9 +80318,10 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_3072_cond_add_48(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0xc0\n\t"
@@ -80261,7 +80375,7 @@ SP_NOINLINE static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -80545,8 +80659,10 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod,
  * a  A single precision integer.
  * n  Integer representing number of bits to shift.
  */
-static void sp_3072_lshift_96(sp_digit* r, const sp_digit* a, byte n)
+WC_OMIT_FRAME_POINTER static void sp_3072_lshift_96(sp_digit* r,
+    const sp_digit* a, byte n)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #31\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -82924,7 +83040,7 @@ static void sp_3072_lshift_96(sp_digit* r, const sp_digit* a, byte n)
         "str	r3, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -83338,9 +83454,10 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_add_word_64(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_add_word_64(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -83863,7 +83980,7 @@ SP_NOINLINE static sp_digit sp_4096_add_word_64(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -83873,9 +83990,10 @@ SP_NOINLINE static sp_digit sp_4096_add_word_64(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_sub_in_place_128(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -85036,7 +85154,7 @@ SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -85047,9 +85165,10 @@ SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_add_128(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -86147,7 +86266,7 @@ SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -86232,9 +86351,10 @@ SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_add_128(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -86300,7 +86420,7 @@ SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -86312,9 +86432,10 @@ SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_sub_in_place_128(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -86379,7 +86500,7 @@ SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -86392,9 +86513,10 @@ SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mul_128(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[128 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -86644,8 +86766,10 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -86655,8 +86779,10 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_sqr_128(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -87109,7 +87235,8 @@ SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -87140,9 +87267,10 @@ static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho)
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mul_d_128(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #2\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -87321,7 +87449,7 @@ SP_NOINLINE static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -87349,9 +87477,10 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_cond_sub_128(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #2\n\t"
@@ -87404,7 +87533,7 @@ SP_NOINLINE static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -87416,9 +87545,10 @@ SP_NOINLINE static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -90124,7 +90254,8 @@ SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -90165,9 +90296,10 @@ SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_sub_128(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r3, #0\n\t"
@@ -90227,7 +90359,7 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -90239,9 +90371,10 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_sub_128(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -91338,7 +91471,7 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -91353,9 +91486,10 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_4096_word_128(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -91955,7 +92089,7 @@ SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -92071,9 +92205,10 @@ static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_4096_cmp_128(const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_4096_cmp_128(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -92174,7 +92309,7 @@ SP_NOINLINE static sp_int32 sp_4096_cmp_128(const sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -92663,9 +92798,10 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_4096_cond_add_64(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0xff\n\t"
@@ -92724,7 +92860,7 @@ SP_NOINLINE static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -93008,8 +93144,10 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod,
  * a  A single precision integer.
  * n  Integer representing number of bits to shift.
  */
-static void sp_4096_lshift_128(sp_digit* r, const sp_digit* a, byte n)
+WC_OMIT_FRAME_POINTER static void sp_4096_lshift_128(sp_digit* r,
+    const sp_digit* a, byte n)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #31\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -96175,7 +96313,7 @@ static void sp_4096_lshift_128(sp_digit* r, const sp_digit* a, byte n)
         "str	r4, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -96455,9 +96593,10 @@ static const sp_digit p256_b[8] = {
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mul_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[8 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -96687,8 +96826,10 @@ SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -96698,8 +96839,10 @@ SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_sqr_8(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -97112,7 +97255,8 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -97123,9 +97267,10 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_add_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -97185,7 +97330,7 @@ SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -97197,9 +97342,10 @@ SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_add_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -97277,7 +97423,7 @@ SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -97567,9 +97713,10 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm)
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
+    const sp_digit* m, sp_digit mp)
 {
+
     (void)mp;
     (void)m;
 
@@ -97877,8 +98024,10 @@ SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m,
         "str	r7, [%[a], #28]\n\t"
         : [a] "+l" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
+    (void)m;
+    (void)mp;
 }
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -97887,9 +98036,10 @@ SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a,
-        const sp_digit* m, sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -98535,7 +98685,8 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -98672,8 +98823,10 @@ static void sp_256_mont_inv_8(sp_digit* r, const sp_digit* a, sp_digit* td)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_256_cmp_8(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -98769,7 +98922,7 @@ SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -98788,9 +98941,10 @@ SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_cond_sub_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #32\n\t"
@@ -98838,7 +98992,7 @@ SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -98890,9 +99044,10 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_add_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     (void)m;
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
@@ -99065,8 +99220,10 @@ SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #28]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11"
     );
+    (void)m;
 }
 
 /* Double a Montgomery form number (r = a + a % m).
@@ -99075,9 +99232,10 @@ SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a,
  * a   Number to double in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_dbl_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     (void)m;
     __asm__ __volatile__ (
         "ldr	r4, [%[a]]\n\t"
@@ -99242,8 +99400,10 @@ SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #28]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11"
     );
+    (void)m;
 }
 
 /* Triple a Montgomery form number (r = a + a + a % m).
@@ -99252,9 +99412,10 @@ SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a,
  * a   Number to triple in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_tpl_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     (void)m;
     __asm__ __volatile__ (
         "ldr	r6, [%[a]]\n\t"
@@ -99575,8 +99736,10 @@ SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a,
         "str	r2, [%[r], #28]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
+    (void)m;
 }
 
 /* Subtract two Montgomery form numbers (r = a - b % m).
@@ -99586,9 +99749,10 @@ SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a,
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sub_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     (void)m;
     __asm__ __volatile__ (
         "ldr	r4, [%[a]]\n\t"
@@ -99754,8 +99918,10 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #28]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11"
     );
+    (void)m;
 }
 
 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
@@ -99764,9 +99930,10 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a,
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_256_mont_div2_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_div2_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     (void)m;
     __asm__ __volatile__ (
         "ldr	r6, [%[a]]\n\t"
@@ -100027,8 +100194,9 @@ SP_NOINLINE static void sp_256_mont_div2_8(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6"
     );
+    (void)m;
 }
 
 /* Double the Montgomery form projective point p.
@@ -103334,8 +103502,9 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am,
  *
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_256_add_one_8(sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_add_one_8(sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #1\n\t"
         "ldr	r1, [%[a]]\n\t"
@@ -103411,7 +103580,7 @@ SP_NOINLINE static void sp_256_add_one_8(sp_digit* a)
         "str	r1, [%[a], #28]\n\t"
         : [a] "+l" (a)
         :
-        : "memory", "r1", "r2", "cc"
+        : "memory", "cc", "r1", "r2"
     );
 }
 
@@ -103764,9 +103933,10 @@ int sp_ecc_secret_gen_256_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_sub_in_place_8(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -103825,7 +103995,7 @@ SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -103836,9 +104006,10 @@ SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_sub_in_place_8(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -103919,7 +104090,7 @@ SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -103931,9 +104102,10 @@ SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mul_d_8(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #32\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -104107,7 +104279,7 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -104120,9 +104292,10 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_256_word_8(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -104722,7 +104895,7 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -105347,9 +105520,10 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_sub_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r3, #0\n\t"
@@ -105403,7 +105577,7 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -105415,9 +105589,10 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_256_sub_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -105494,7 +105669,7 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -105505,8 +105680,10 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_rshift1_8(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static void sp_256_rshift1_8(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "ldr	r2, [%[a]]\n\t"
         "ldr	r3, [%[a], #4]\n\t"
@@ -105650,7 +105827,7 @@ static void sp_256_rshift1_8(sp_digit* r, const sp_digit* a)
         "str	r3, [%[r], #28]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
 }
 
@@ -105660,8 +105837,10 @@ static void sp_256_rshift1_8(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus.
  */
-static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r7, [%[a]]\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -105921,12 +106100,13 @@ static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
         "str	r6, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
-static int sp_256_num_bits_8(sp_digit* a)
+WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(sp_digit* a)
 {
+
     static const byte sp_num_bits_table[256] = {
         0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -106676,7 +106856,7 @@ static int sp_256_num_bits_8(sp_digit* a)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [table] "+l" (table)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)a;
 }
@@ -107615,9 +107795,10 @@ static const sp_digit p384_b[12] = {
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mul_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[12 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -107847,8 +108028,10 @@ SP_NOINLINE static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -107858,8 +108041,10 @@ SP_NOINLINE static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_sqr_12(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -108272,7 +108457,8 @@ SP_NOINLINE static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -108283,9 +108469,10 @@ SP_NOINLINE static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_add_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -108345,7 +108532,7 @@ SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -108357,9 +108544,10 @@ SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_add_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -108471,7 +108659,7 @@ SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -108769,9 +108957,10 @@ static int sp_384_point_to_ecc_point_12(const sp_point_384* p, ecc_point* pm)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_cond_sub_12(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #48\n\t"
@@ -108819,7 +109008,7 @@ SP_NOINLINE static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -108832,9 +109021,10 @@ SP_NOINLINE static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
+    const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -109548,7 +109738,8 @@ SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -109701,8 +109892,10 @@ static void sp_384_mont_inv_12(sp_digit* r, const sp_digit* a, sp_digit* td)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_384_cmp_12(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -109798,7 +109991,7 @@ SP_NOINLINE static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -109856,9 +110049,10 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_add_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_384_add_12(r, a, b);
@@ -109871,9 +110065,10 @@ SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r, const sp_digit* a,
  * a   Number to double in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_dbl_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_384_add_12(r, a, a);
@@ -109886,9 +110081,10 @@ SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r, const sp_digit* a,
  * a   Number to triple in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_tpl_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_384_add_12(r, a, a);
@@ -109904,9 +110100,10 @@ SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_sub_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r3, #0\n\t"
@@ -109960,7 +110157,7 @@ SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -109972,9 +110169,10 @@ SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_sub_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -110085,7 +110283,7 @@ SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -110099,9 +110297,10 @@ SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_cond_add_12(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #48\n\t"
@@ -110155,7 +110354,7 @@ SP_NOINLINE static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -110167,9 +110366,10 @@ SP_NOINLINE static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a,
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_sub_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_384_sub_12(r, a, b);
@@ -110181,8 +110381,10 @@ SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static void sp_384_rshift1_12(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "ldr	r2, [%[a]]\n\t"
         "ldr	r3, [%[a], #4]\n\t"
@@ -110402,7 +110604,7 @@ static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a)
         "str	r4, [%[r], #44]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
 }
 
@@ -110412,9 +110614,10 @@ static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_384_mont_div2_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_div2_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_384_cond_add_12(r, a, m, 0 - (a[0] & 1));
@@ -113783,8 +113986,9 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am,
  *
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_384_add_one_12(sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_add_one_12(sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #1\n\t"
         "ldr	r1, [%[a]]\n\t"
@@ -113896,7 +114100,7 @@ SP_NOINLINE static void sp_384_add_one_12(sp_digit* a)
         "str	r1, [%[a], #44]\n\t"
         : [a] "+l" (a)
         :
-        : "memory", "r1", "r2", "cc"
+        : "memory", "cc", "r1", "r2"
     );
 }
 
@@ -114249,9 +114453,10 @@ int sp_ecc_secret_gen_384_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_sub_in_place_12(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -114310,7 +114515,7 @@ SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -114321,9 +114526,10 @@ SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_384_sub_in_place_12(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -114440,7 +114646,7 @@ SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -114452,9 +114658,10 @@ SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mul_d_12(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #48\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -114628,7 +114835,7 @@ SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -114641,9 +114848,10 @@ SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_384_word_12(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -115243,7 +115451,7 @@ SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -115838,9 +116046,10 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
  * a  Number to divide.
  * m  Modulus.
  */
-static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r3, [%[a]]\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -116234,12 +116443,13 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a,
         "str	r7, [%[r], #44]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
-static int sp_384_num_bits_12(sp_digit* a)
+WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(sp_digit* a)
 {
+
     static const byte sp_num_bits_table[256] = {
         0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -117433,7 +117643,7 @@ static int sp_384_num_bits_12(sp_digit* a)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [table] "+l" (table)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)a;
 }
@@ -118419,9 +118629,10 @@ static const sp_digit p521_b[17] = {
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mul_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[17 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -118651,8 +118862,10 @@ SP_NOINLINE static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -118662,8 +118875,10 @@ SP_NOINLINE static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_sqr_17(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -119076,7 +119291,8 @@ SP_NOINLINE static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -119087,9 +119303,10 @@ SP_NOINLINE static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_add_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -119149,7 +119366,7 @@ SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -119161,9 +119378,10 @@ SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_add_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -119319,7 +119537,7 @@ SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -119546,9 +119764,10 @@ static int sp_521_point_to_ecc_point_17(const sp_point_521* p, ecc_point* pm)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_cond_sub_17(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0x44\n\t"
@@ -119596,7 +119815,7 @@ SP_NOINLINE static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -119607,9 +119826,10 @@ SP_NOINLINE static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a,
+    const sp_digit* m, sp_digit mp)
 {
+
     (void)mp;
     (void)m;
 
@@ -120244,8 +120464,10 @@ SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a, const sp_digit* m,
         "add	sp, sp, #0x44\n\t"
         : [a] "+l" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
+    (void)m;
+    (void)mp;
 }
 
 /* Reduce the number back to 521 bits using Montgomery reduction.
@@ -120254,9 +120476,10 @@ SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a, const sp_digit* m,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_521_mont_reduce_order_17(sp_digit* a,
-        const sp_digit* m, sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -121411,7 +121634,8 @@ SP_NOINLINE static void sp_521_mont_reduce_order_17(sp_digit* a,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -121561,8 +121785,10 @@ static void sp_521_mont_inv_17(sp_digit* r, const sp_digit* a, sp_digit* td)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_521_cmp_17(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -121658,7 +121884,7 @@ SP_NOINLINE static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -121716,9 +121942,10 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_add_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldm	%[a]!, {r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7}\n\t"
@@ -122039,7 +122266,7 @@ SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -122049,9 +122276,10 @@ SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r, const sp_digit* a,
  * a   Number to double in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_dbl_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -122355,7 +122583,7 @@ SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -122365,9 +122593,10 @@ SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r, const sp_digit* a,
  * a   Number to triple in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_tpl_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -122833,7 +123062,7 @@ SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -122844,9 +123073,10 @@ SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a,
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_sub_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldm	%[a]!, {r4, r5}\n\t"
         "ldm	%[b]!, {r6, r7}\n\t"
@@ -123173,7 +123403,7 @@ SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -123185,9 +123415,10 @@ SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_cond_add_17(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0x44\n\t"
@@ -123241,7 +123472,7 @@ SP_NOINLINE static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -123251,8 +123482,10 @@ SP_NOINLINE static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static void sp_521_rshift1_17(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "ldr	r2, [%[a]]\n\t"
         "ldr	r3, [%[a], #4]\n\t"
@@ -123567,7 +123800,7 @@ static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a)
         "str	r3, [%[r], #64]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
 }
 
@@ -123577,9 +123810,10 @@ static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_521_mont_div2_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_div2_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_521_cond_add_17(r, a, m, 0 - (a[0] & 1));
@@ -127579,8 +127813,9 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am,
  *
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_521_add_one_17(sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_add_one_17(sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #1\n\t"
         "ldr	r1, [%[a]]\n\t"
@@ -127737,7 +127972,7 @@ SP_NOINLINE static void sp_521_add_one_17(sp_digit* a)
         "str	r1, [%[a], #64]\n\t"
         : [a] "+l" (a)
         :
-        : "memory", "r1", "r2", "cc"
+        : "memory", "cc", "r1", "r2"
     );
 }
 
@@ -128089,8 +128324,10 @@ int sp_ecc_secret_gen_521_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv,
  * a  A single precision integer.
  * n  Integer representing number of bits to shift.
  */
-static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n)
+WC_OMIT_FRAME_POINTER static void sp_521_rshift_17(sp_digit* r,
+    const sp_digit* a, byte n)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #32\n\t"
 #ifdef WOLFSSL_KEIL
@@ -128431,7 +128668,7 @@ static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n)
         "str	r4, [%[r], #64]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -128445,8 +128682,10 @@ static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n)
  * a  A single precision integer.
  * n  Integer representing number of bits to shift.
  */
-static void sp_521_lshift_17(sp_digit* r, const sp_digit* a, byte n)
+WC_OMIT_FRAME_POINTER static void sp_521_lshift_17(sp_digit* r,
+    const sp_digit* a, byte n)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #31\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -128858,7 +129097,7 @@ static void sp_521_lshift_17(sp_digit* r, const sp_digit* a, byte n)
         "str	r4, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -128868,8 +129107,10 @@ static void sp_521_lshift_17(sp_digit* r, const sp_digit* a, byte n)
  * a  A single precision integer.
  * n  Integer representing number of bits to shift.
  */
-static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n)
+WC_OMIT_FRAME_POINTER static void sp_521_lshift_34(sp_digit* r,
+    const sp_digit* a, byte n)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #31\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -129719,7 +129960,7 @@ static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n)
         "str	r5, [%[r], #4]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -129729,9 +129970,10 @@ static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_sub_in_place_17(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -129790,7 +130032,7 @@ SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -129801,9 +130043,10 @@ SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_sub_in_place_17(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -129966,7 +130209,7 @@ SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -129978,9 +130221,10 @@ SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mul_d_17(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #0x44\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -130154,7 +130398,7 @@ SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -130167,9 +130411,10 @@ SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_521_word_17(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -130769,7 +131014,7 @@ SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -131395,9 +131640,10 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_sub_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r3, #0\n\t"
@@ -131451,7 +131697,7 @@ SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -131463,9 +131709,10 @@ SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_521_sub_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -131620,7 +131867,7 @@ SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -131632,9 +131879,10 @@ SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
  * a  Number to divide.
  * m  Modulus.
  */
-static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r3, [%[a]]\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -132183,12 +132431,13 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #64]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
-static int sp_521_num_bits_17(sp_digit* a)
+WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(sp_digit* a)
 {
+
     static const byte sp_num_bits_table[256] = {
         0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -133952,7 +134201,7 @@ static int sp_521_num_bits_17(sp_digit* a)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [table] "+l" (table)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)a;
 }
@@ -134820,9 +135069,10 @@ typedef struct sp_point_1024 {
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mul_16(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[16 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -135052,8 +135302,10 @@ SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -135065,9 +135317,10 @@ SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mul_16(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "sub	sp, sp, #0x40\n\t"
         "mov	r8, %[r]\n\t"
@@ -170940,7 +171193,7 @@ SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
 }
 
@@ -170951,8 +171204,10 @@ SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_sqr_16(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -171365,7 +171620,8 @@ SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -171375,8 +171631,10 @@ SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_sqr_16(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "sub	sp, sp, #0x40\n\t"
         "mov	r8, %[r]\n\t"
@@ -197720,7 +197978,8 @@ SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
         "stm	%[r]!, {r2, r3, r4, r5}\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -197731,9 +197990,10 @@ SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_add_16(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -197879,7 +198139,7 @@ SP_NOINLINE static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -197890,9 +198150,10 @@ SP_NOINLINE static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_add_word_16(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_add_word_16(
+    sp_digit* r, const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r5, #0\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -198031,7 +198292,7 @@ SP_NOINLINE static sp_digit sp_1024_add_word_16(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r3", "r4", "r5"
     );
     return (word32)(size_t)r;
 }
@@ -198041,9 +198302,10 @@ SP_NOINLINE static sp_digit sp_1024_add_word_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_sub_in_place_32(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r4, r5}\n\t"
         "ldr	r2, [%[a]]\n\t"
@@ -198340,7 +198602,7 @@ SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
 #endif
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
 }
@@ -198351,9 +198613,10 @@ SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_add_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -198635,7 +198898,7 @@ SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -198713,9 +198976,10 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_sub_16(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "ldm	%[b]!, {r5, r6}\n\t"
         "ldm	%[a]!, {r3, r4}\n\t"
@@ -198860,7 +199124,7 @@ SP_NOINLINE static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a,
 #endif
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6"
     );
     return (word32)(size_t)r;
 }
@@ -198907,9 +199171,10 @@ SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mul_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     sp_digit t[32 * 2];
     sp_digit* tmp = t;
     __asm__ __volatile__ (
@@ -199139,8 +199404,10 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
         "mov	%[b], r10\n\t"
         : [a] "+l" (a), [b] "+l" (b), [tmp] "+l" (tmp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
+    (void)r;
 
     XMEMCPY(r, t, sizeof(t));
 }
@@ -199150,8 +199417,10 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_sqr_32(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
         "movs	r4, #0\n\t"
@@ -199574,7 +199843,8 @@ SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
         "add	sp, sp, r6\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12"
     );
 }
 
@@ -199670,9 +199940,10 @@ static const sp_point_1024 p1024_base = {
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_sub_in_place_32(
+    sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r7, %[a]\n\t"
         "movs	r2, #0\n\t"
@@ -199731,7 +200002,7 @@ SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -199745,9 +200016,10 @@ SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_cond_sub_32(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0x80\n\t"
@@ -199795,7 +200067,7 @@ SP_NOINLINE static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -199807,9 +200079,10 @@ SP_NOINLINE static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_add_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, %[a]\n\t"
         "movs	r7, #0\n\t"
@@ -199869,7 +200142,7 @@ SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a,
         "movs	%[r], r3\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)r;
 }
@@ -199881,9 +200154,10 @@ SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a,
-        sp_digit b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mul_d_32(sp_digit* r,
+    const sp_digit* a, sp_digit b)
 {
+
     __asm__ __volatile__ (
         "movs	r6, #0x80\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -200057,7 +200331,7 @@ SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a,
         "str	r3, [%[r]]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -200070,9 +200344,10 @@ SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a,
  *
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
-SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0,
-        sp_digit div)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_1024_word_32(sp_digit d1,
+    sp_digit d0, sp_digit div)
 {
+
     __asm__ __volatile__ (
         "movs	r3, #0\n\t"
 #if defined(__clang__) || defined(WOLFSSL_KEIL)
@@ -200672,7 +200947,7 @@ SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0,
         "movs	%[d1], r3\n\t"
         : [d1] "+l" (d1), [d0] "+l" (d0), [div] "+l" (div)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)d1;
 }
@@ -200714,8 +200989,10 @@ static void sp_1024_mask_32(sp_digit* r, const sp_digit* a, sp_digit m)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
-SP_NOINLINE static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_int32 sp_1024_cmp_32(
+    const sp_digit* a, const sp_digit* b)
 {
+
     __asm__ __volatile__ (
         "movs	r2, #0\n\t"
         "movs	r3, #0\n\t"
@@ -200811,7 +201088,7 @@ SP_NOINLINE static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b)
         "movs	%[a], r2\n\t"
         : [a] "+l" (a), [b] "+l" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7"
     );
     return (word32)(size_t)a;
 }
@@ -201153,9 +201430,10 @@ static int sp_1024_point_to_ecc_point_32(const sp_point_1024* p, ecc_point* pm)
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-SP_NOINLINE static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m,
-        sp_digit mp)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
+    sp_digit* a, const sp_digit* m, sp_digit mp)
 {
+
     __asm__ __volatile__ (
         "movs	r7, #0\n\t"
         "mov	r8, %[mp]\n\t"
@@ -202229,7 +202507,8 @@ SP_NOINLINE static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m,
 #endif /* WOLFSSL_SP_LARGE_CODE */
         : [a] "+l" (a), [m] "+l" (m), [mp] "+l" (mp)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -202377,9 +202656,10 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_add_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r4, [%[a]]\n\t"
         "ldr	r5, [%[a], #4]\n\t"
@@ -203276,7 +203556,7 @@ SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #124]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -203286,9 +203566,10 @@ SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r, const sp_digit* a,
  * a   Number to double in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_dbl_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r4, [%[a]]\n\t"
         "ldr	r5, [%[a], #4]\n\t"
@@ -204153,7 +204434,7 @@ SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #124]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -204163,9 +204444,10 @@ SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r, const sp_digit* a,
  * a   Number to triple in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_tpl_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r4, [%[a]]\n\t"
         "ldr	r5, [%[a], #4]\n\t"
@@ -205923,7 +206205,7 @@ SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r, const sp_digit* a,
         "str	r7, [%[r], #124]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [m] "+l" (m)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -205934,9 +206216,10 @@ SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r, const sp_digit* a,
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_sub_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* b, const sp_digit* m)
 {
+
     __asm__ __volatile__ (
         "ldr	r4, [%[a]]\n\t"
         "ldr	r5, [%[a], #4]\n\t"
@@ -207357,7 +207640,7 @@ SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a,
         "str	r5, [%[r], #124]\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7"
     );
 }
 
@@ -207369,9 +207652,10 @@ SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-SP_NOINLINE static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit sp_1024_cond_add_32(
+    sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
 {
+
     __asm__ __volatile__ (
         "movs	r4, #0\n\t"
         "movs	r5, #0x80\n\t"
@@ -207425,7 +207709,7 @@ SP_NOINLINE static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a,
         "movs	%[r], r4\n\t"
         : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
 }
@@ -207435,8 +207719,10 @@ SP_NOINLINE static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a)
+WC_OMIT_FRAME_POINTER static void sp_1024_rshift1_32(sp_digit* r,
+    const sp_digit* a)
 {
+
     __asm__ __volatile__ (
         "ldr	r2, [%[a]]\n\t"
         "ldr	r3, [%[a], #4]\n\t"
@@ -208036,7 +208322,7 @@ static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a)
         "str	r3, [%[r], #124]\n\t"
         : [r] "+l" (r), [a] "+l" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5"
     );
 }
 
@@ -208046,9 +208332,10 @@ static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_1024_mont_div2_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* m)
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_div2_32(sp_digit* r,
+    const sp_digit* a, const sp_digit* m)
 {
+
     sp_digit o;
 
     o = sp_1024_cond_add_32(r, a, m, 0 - (a[0] & 1));
diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c
index 08b3b31a659..9a334f4d4a4 100644
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -303,10 +303,10 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a)
  * b  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -650,8 +650,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r,
         "SUB	%[r], %[r], #0x20\n\t"
         "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "ADD	sp, sp, #0x24\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -665,10 +670,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r,
  * b  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -790,8 +795,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mul_8(sp_digit* r,
         "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "ADD	sp, sp, #0x2c\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7",
             "r8", "r9", "lr"
     );
@@ -835,8 +845,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_8(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -890,8 +905,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -948,8 +968,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_16(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -1096,8 +1121,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -1182,8 +1212,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_32(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -1390,8 +1425,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -1532,8 +1572,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_64(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -1615,10 +1660,10 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_8(sp_digit* r,
     const sp_digit* a)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -1853,8 +1898,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r,
         "SUB	%[r], %[r], #0x20\n\t"
         "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -1867,10 +1917,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r,
  * a  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_sqr_8(sp_digit* r,
     const sp_digit* a)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -1973,8 +2023,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r,
         "LDM	sp, {r0, r1, r2, r3, r4, r5, r6}\n\t"
         "STM	lr, {r0, r1, r2, r3, r4, r5, r6}\n\t"
         "ADD	sp, sp, #0x20\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -2017,8 +2072,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_8(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -2110,8 +2170,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_16(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -2231,8 +2296,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_32(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -2324,8 +2394,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_64(sp_digit* r,
         "BNE.N	L_sp_2048_add_64_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -2379,8 +2454,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_64(sp_digit* a,
         "BNE.N	L_sp_2048_sub_in_place_64_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -2510,8 +2590,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_2048_mul_64_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -2629,8 +2714,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_2048_sqr_64_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -2704,8 +2794,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_add_32(sp_digit* r,
         "BNE.N	L_sp_2048_add_32_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -2759,8 +2854,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
         "BNE.N	L_sp_2048_sub_in_place_32_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -2890,8 +2990,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_2048_mul_32_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -3009,8 +3114,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_2048_sqr_32_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -3094,8 +3204,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r,
         "BLT.N	L_sp_2048_mul_d_64_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #256]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -3442,8 +3557,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_64(sp_digit* r,
         "UMLAL	r3, r4, %[b], r8\n\t"
         "STM	%[r]!, {r3}\n\t"
         "STR	r4, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -3515,8 +3635,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r,
         "BLT.N	L_sp_2048_cond_sub_32_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -3661,8 +3786,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -3678,10 +3808,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_32(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -3976,8 +4106,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -3992,10 +4127,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -4097,8 +4232,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -4115,10 +4255,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -4323,8 +4463,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -4339,10 +4484,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -4432,8 +4577,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_32(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -4527,8 +4677,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r,
         "BLT.N	L_sp_2048_mul_d_32_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #128]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -4715,8 +4870,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r,
         "UMLAL	r4, r5, %[b], r8\n\t"
         "STM	%[r]!, {r4}\n\t"
         "STR	r5, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -4733,10 +4893,10 @@ WC_OMIT_FRAME_POINTER static void sp_2048_mul_d_32(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_32(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_32(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -4782,8 +4942,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -4800,10 +4965,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_32(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_32(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -4866,8 +5031,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -5281,8 +5451,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_32(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -5705,8 +5880,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r,
         "BLT.N	L_sp_2048_cond_sub_64_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -5963,8 +6143,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -5980,10 +6165,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_sub_64(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -6534,8 +6719,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -6550,10 +6740,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -6655,8 +6845,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -6673,10 +6868,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -7041,8 +7236,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -7057,10 +7257,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_2048_mont_reduce_64(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -7150,8 +7350,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_2048_mont_reduce_64(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -7238,8 +7443,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r,
         "BNE.N	L_sp_2048_sub_64_word_%=\n\t"
 #endif
         "MOV	%[r], r11\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -7381,8 +7591,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -7400,10 +7615,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_sub_64(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_64(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_64(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -7449,8 +7664,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -7467,10 +7687,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_64(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_2048_word_64(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -7533,8 +7753,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -8403,8 +8628,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_2048_cmp_64(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -8936,8 +9166,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r,
         "BLT.N	L_sp_2048_cond_add_32_words_%=\n\t"
 #endif
         "MOV	%[r], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -9082,8 +9317,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_2048_cond_add_32(sp_digit* r,
         "ADCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "ADC	%[r], r10, r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -9763,8 +10003,13 @@ WC_OMIT_FRAME_POINTER static void sp_2048_lshift_64(sp_digit* r,
         "ORR	r6, r6, r3\n\t"
         "STR	r5, [%[r]]\n\t"
         "STR	r6, [%[r], #4]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r3", "r7"
     );
 }
@@ -11196,8 +11441,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_12(sp_digit* r, const sp_digit* a,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "LDM	sp!, {r3, r4, r5, r6}\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
             "r12"
     );
@@ -11247,8 +11497,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_12(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -11316,8 +11571,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -11388,8 +11648,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_24(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -11568,8 +11833,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -11682,8 +11952,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_48(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -11946,8 +12221,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -12144,8 +12424,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_96(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -12915,8 +13200,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
         "LDM	sp!, {r2, r3, r4, r8}\n\t"
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r12"
     );
@@ -12965,8 +13255,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_12(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -13072,8 +13367,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_24(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -13221,8 +13521,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_48(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -13314,8 +13619,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_96(sp_digit* r,
         "BNE.N	L_sp_3072_add_96_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -13369,8 +13679,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_96(sp_digit* a,
         "BNE.N	L_sp_3072_sub_in_place_96_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -13500,8 +13815,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_3072_mul_96_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -13619,8 +13939,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_3072_sqr_96_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -13694,8 +14019,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_add_48(sp_digit* r,
         "BNE.N	L_sp_3072_add_48_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -13749,8 +14079,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
         "BNE.N	L_sp_3072_sub_in_place_48_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -13880,8 +14215,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_3072_mul_48_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -13999,8 +14339,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_3072_sqr_48_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -14084,8 +14429,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r,
         "BLT.N	L_sp_3072_mul_d_96_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #384]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -14592,8 +14942,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_96(sp_digit* r,
         "UMLAL	r5, r3, %[b], r8\n\t"
         "STM	%[r]!, {r5}\n\t"
         "STR	r3, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -14665,8 +15020,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r,
         "BLT.N	L_sp_3072_cond_sub_48_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -14867,8 +15227,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -14884,10 +15249,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_48(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -15310,8 +15675,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -15326,10 +15696,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -15431,8 +15801,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -15449,10 +15824,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -15737,8 +16112,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -15753,10 +16133,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_48(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -15846,8 +16226,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_48(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -15941,8 +16326,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r,
         "BLT.N	L_sp_3072_mul_d_48_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #192]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -16209,8 +16599,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r,
         "UMLAL	r5, r3, %[b], r8\n\t"
         "STM	%[r]!, {r5}\n\t"
         "STR	r3, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -16227,10 +16622,10 @@ WC_OMIT_FRAME_POINTER static void sp_3072_mul_d_48(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_48(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_48(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -16276,8 +16671,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -16294,10 +16694,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_48(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_48(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -16360,8 +16760,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -16951,8 +17356,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_48(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -17375,8 +17785,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r,
         "BLT.N	L_sp_3072_cond_sub_96_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -17745,8 +18160,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -17762,10 +18182,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_sub_96(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -18572,8 +18992,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -18588,10 +19013,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -18693,8 +19118,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -18711,10 +19141,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -19239,8 +19669,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -19255,10 +19690,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_3072_mont_reduce_96(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -19348,8 +19783,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_3072_mont_reduce_96(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -19436,8 +19876,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r,
         "BNE.N	L_sp_3072_sub_96_word_%=\n\t"
 #endif
         "MOV	%[r], r11\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -19635,8 +20080,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -19654,10 +20104,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_sub_96(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_96(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_96(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -19703,8 +20153,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -19721,10 +20176,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_96(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_3072_word_96(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -19787,8 +20242,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -21009,8 +21469,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_3072_cmp_96(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -21542,8 +22007,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r,
         "BLT.N	L_sp_3072_cond_add_48_words_%=\n\t"
 #endif
         "MOV	%[r], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -21744,8 +22214,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_3072_cond_add_48(sp_digit* r,
         "ADCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "ADC	%[r], r10, r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -22617,8 +23092,13 @@ WC_OMIT_FRAME_POINTER static void sp_3072_lshift_96(sp_digit* r,
         "ORR	r4, r4, r3\n\t"
         "STR	r6, [%[r]]\n\t"
         "STR	r4, [%[r], #4]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r3", "r7"
     );
 }
@@ -23271,8 +23751,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -23525,8 +24010,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_add_128(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -23657,8 +24147,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_add_128(sp_digit* r,
         "BNE.N	L_sp_4096_add_128_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -23712,8 +24207,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_in_place_128(sp_digit* a,
         "BNE.N	L_sp_4096_sub_in_place_128_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -23843,8 +24343,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_128(sp_digit* r,
 #else
         "BGT.N	L_sp_4096_mul_128_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -23963,8 +24468,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_sqr_128(sp_digit* r,
 #else
         "BGT.N	L_sp_4096_sqr_128_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -24046,8 +24556,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r,
         "BLT.N	L_sp_4096_mul_d_128_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #512]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -24714,8 +25229,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_mul_d_128(sp_digit* r,
         "UMLAL	r4, r5, %[b], r8\n\t"
         "STM	%[r]!, {r4}\n\t"
         "STR	r5, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -24788,8 +25308,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r,
         "BLT.N	L_sp_4096_cond_sub_128_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -25270,8 +25795,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -25287,10 +25817,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_sub_128(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -26353,8 +26883,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -26369,10 +26904,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -26474,8 +27009,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -26492,10 +27032,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -27180,8 +27720,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -27196,10 +27741,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_4096_mont_reduce_128(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -27289,8 +27834,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_4096_mont_reduce_128(
 #endif
         /* Loop Done */
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -27377,8 +27927,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r,
         "BNE.N	L_sp_4096_sub_128_word_%=\n\t"
 #endif
         "MOV	%[r], r11\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -27632,8 +28187,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -27651,10 +28211,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_sub_128(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_4096_word_128(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_4096_word_128(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -27700,8 +28260,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -27718,10 +28283,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_4096_word_128(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_4096_word_128(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -27784,8 +28349,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -29358,8 +29928,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_4096_cmp_128(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -29891,8 +30466,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r,
         "BLT.N	L_sp_4096_cond_add_64_words_%=\n\t"
 #endif
         "MOV	%[r], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -30149,8 +30729,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_4096_cond_add_64(sp_digit* r,
         "ADCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "ADC	%[r], r10, r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -31214,8 +31799,13 @@ WC_OMIT_FRAME_POINTER static void sp_4096_lshift_128(sp_digit* r,
         "ORR	r5, r5, r3\n\t"
         "STR	r4, [%[r]]\n\t"
         "STR	r5, [%[r], #4]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r3", "r7"
     );
 }
@@ -31612,8 +32202,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_256_mul_8_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -31628,10 +32223,10 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
  * b  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mul_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -31975,8 +32570,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r,
         "SUB	%[r], %[r], #0x20\n\t"
         "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "ADD	sp, sp, #0x24\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -31990,10 +32590,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r,
  * b  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mul_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -32115,8 +32715,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mul_8(sp_digit* r,
         "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "ADD	sp, sp, #0x2c\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7",
             "r8", "r9", "lr"
     );
@@ -32237,8 +32842,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_256_sqr_8_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -32252,10 +32862,10 @@ WC_OMIT_FRAME_POINTER static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_sqr_8(sp_digit* r,
     const sp_digit* a)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -32490,8 +33100,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r,
         "SUB	%[r], %[r], #0x20\n\t"
         "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -32504,10 +33119,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r,
  * a  A single precision integer.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_sqr_8(sp_digit* r_p,
     const sp_digit* a_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_sqr_8(sp_digit* r,
     const sp_digit* a)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -32610,8 +33225,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_sqr_8(sp_digit* r,
         "LDM	sp, {r0, r1, r2, r3, r4, r5, r6}\n\t"
         "STM	lr, {r0, r1, r2, r3, r4, r5, r6}\n\t"
         "ADD	sp, sp, #0x20\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -32668,8 +33288,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_add_8(sp_digit* r,
         "BNE.N	L_sp_256_add_8_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -32714,8 +33339,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_add_8(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -32958,8 +33588,13 @@ WC_OMIT_FRAME_POINTER static int sp_256_mod_mul_norm_8(sp_digit* r,
         "STM	%[r], {r2, r3, r4, r5, r6, r7, r8, r11}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADD	sp, sp, #0x18\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -33177,11 +33812,11 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm)
  * mp  Montgomery multiplier.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p,
     sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -33644,8 +34279,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r,
         "LDR	%[r], [sp, #64]\n\t"
         "STM	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -33672,11 +34312,11 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r,
  * mp  Montgomery multiplier.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p,
     sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -33917,8 +34557,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r,
         "LDR	%[r], [sp, #68]\n\t"
         "STM	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x4c\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7",
             "r8", "r9", "lr"
     );
@@ -33944,10 +34589,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r,
  * mp  Montgomery multiplier.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -34301,8 +34946,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r,
         "LDR	%[r], [sp, #64]\n\t"
         "STM	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -34327,10 +34977,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r,
  * mp  Montgomery multiplier.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -34553,8 +35203,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r,
         "LDR	%[r], [sp, #64]\n\t"
         "STM	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -34810,8 +35465,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_256_cmp_8(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -34874,8 +35534,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r,
         "BLT.N	L_sp_256_cond_sub_8_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -34936,8 +35601,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -34955,10 +35625,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_cond_sub_8(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35061,8 +35731,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -35077,10 +35752,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35165,8 +35840,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -35182,10 +35862,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a,
     const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35325,8 +36005,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
         "LDR	%[a], [sp, #64]\n\t"
         "STM	%[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "ADD	sp, sp, #0x44\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11", "r12", "lr"
     );
@@ -35350,10 +36035,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35456,8 +36141,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -35472,10 +36162,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_reduce_order_8(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35560,8 +36250,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_reduce_order_8(
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -35618,10 +36313,10 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_add_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_add_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35667,8 +36362,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r,
         "SBCS	r11, r11, lr, LSR #31\n\t"
         "SBC	r12, r12, lr\n\t"
         "STM	%[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -35686,10 +36386,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_dbl_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_dbl_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35730,8 +36430,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r,
         "SBCS	r10, r10, r2, LSR #31\n\t"
         "SBC	r11, r11, r2\n\t"
         "STM	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r2"
     );
@@ -35749,10 +36454,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_tpl_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_tpl_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35825,8 +36530,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r,
         "SBCS	r10, r10, r12, LSR #31\n\t"
         "SBC	r11, r11, r12\n\t"
         "STM	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r2", "r3", "r12"
     );
@@ -35845,10 +36555,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sub_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_sub_8(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35893,8 +36603,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r,
         "ADCS	r11, r11, lr, LSR #31\n\t"
         "ADC	r12, r12, lr\n\t"
         "STM	%[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
     );
@@ -35907,10 +36622,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r,
  * m  Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_div2_8(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_div2_8(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_div2_8(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_256_mont_div2_8(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -35958,8 +36673,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_256_mont_div2_8(sp_digit* r,
         "ORR	r10, r10, r7, LSL #31\n\t"
         "ORR	r11, r11, r3, LSL #31\n\t"
         "STM	%[r], {r8, r9, r10, r11}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3"
     );
@@ -39291,8 +40011,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_add_one_8(sp_digit* a)
         "ADCS	r3, r3, #0x0\n\t"
         "ADCS	r4, r4, #0x0\n\t"
         "STM	%[a]!, {r1, r2, r3, r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4"
     );
 }
@@ -39686,8 +40411,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_in_place_8(sp_digit* a,
         "BNE.N	L_sp_256_sub_in_place_8_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -39729,8 +40459,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_in_place_8(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -39792,8 +40527,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
         "BLT.N	L_sp_256_mul_d_8_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #32]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -39860,8 +40600,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
         "UMLAL	r4, r5, %[b], r8\n\t"
         "STM	%[r]!, {r4}\n\t"
         "STR	r5, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -39878,10 +40623,10 @@ WC_OMIT_FRAME_POINTER static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_256_word_8(sp_digit d1_p,
     sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_256_word_8(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -39927,8 +40672,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -39945,10 +40695,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_256_word_8(sp_digit d1_p,
     sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_256_word_8(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -40011,8 +40761,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -40678,8 +41433,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_8(sp_digit* r,
         "BNE.N	L_sp_256_sub_8_word_%=\n\t"
 #endif
         "MOV	%[r], r11\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -40723,8 +41483,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_256_sub_8(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -40771,8 +41536,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_rshift1_8(sp_digit* r,
         "ORR	r9, r9, r10, LSL #31\n\t"
         "STRD	r6, r7, [%[r]]\n\t"
         "STRD	r8, r9, [%[r], #8]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -40866,8 +41636,13 @@ WC_OMIT_FRAME_POINTER static void sp_256_div2_mod_8(sp_digit* r,
         "ORR	r10, r10, r7, LSL #31\n\t"
         "ORR	r11, r11, r3, LSL #31\n\t"
         "STM	%[r], {r8, r9, r10, r11}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -41070,8 +41845,13 @@ WC_OMIT_FRAME_POINTER static int sp_256_num_bits_8(const sp_digit* a)
     "L_sp_256_num_bits_8_9_%=:\n\t"
 #endif
         "MOV	%[a], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
@@ -42127,8 +42907,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_384_mul_12_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -43158,8 +43943,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_12(sp_digit* r, const sp_digit* a,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "LDM	sp!, {r3, r4, r5, r6}\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
             "r12"
     );
@@ -43279,8 +44069,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_384_sqr_12_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -43982,8 +44777,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_sqr_12(sp_digit* r, const sp_digit* a)
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
         "LDM	sp!, {r2, r3, r4, r8}\n\t"
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r12"
     );
@@ -44039,8 +44839,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_add_12(sp_digit* r,
         "BNE.N	L_sp_384_add_12_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -44092,8 +44897,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_add_12(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -44435,8 +45245,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r,
         "BLT.N	L_sp_384_cond_sub_12_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -44511,8 +45326,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -44529,10 +45349,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_sub_12(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
     const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -44667,8 +45487,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a,
         "STR	r4, [%[a]]\n\t"
         "STR	r5, [%[a], #4]\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -44683,10 +45508,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a,
     const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -44791,8 +45616,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a,
         "STR	r9, [%[a], #12]\n\t"
         "STR	r10, [%[a], #16]\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -45129,8 +45959,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_384_cmp_12(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -45190,10 +46025,10 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_add_12(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_add_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -45271,8 +46106,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r,
         "SBCS	r10, r10, r3\n\t"
         "SBC	r11, r11, r3\n\t"
         "STM	%[r]!, {r8, r9, r10, r11}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
@@ -45285,10 +46125,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_dbl_12(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_dbl_12(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -45356,8 +46196,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r,
         "SBCS	r8, r8, r2\n\t"
         "SBC	r9, r9, r2\n\t"
         "STM	%[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r3"
     );
 }
@@ -45369,10 +46214,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_tpl_12(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_tpl_12(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -45504,8 +46349,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r,
         "SBCS	r8, r8, r2\n\t"
         "SBC	r9, r9, r2\n\t"
         "STM	%[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -45555,8 +46405,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_12(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -45614,8 +46469,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r,
         "BLT.N	L_sp_384_cond_add_12_words_%=\n\t"
 #endif
         "MOV	%[r], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -45690,8 +46550,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r,
         "ADCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "ADC	%[r], r10, r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -45706,10 +46571,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_cond_add_12(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_sub_12(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_384_mont_sub_12(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -45785,8 +46650,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r,
         "ADCS	r10, r10, %[m]\n\t"
         "ADC	r11, r11, %[m]\n\t"
         "STM	%[r]!, {r8, r9, r10, r11}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
@@ -45855,8 +46725,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_rshift1_12(sp_digit* r,
         "LSR	r4, r4, #1\n\t"
         "STR	r3, [%[r], #40]\n\t"
         "STR	r4, [%[r], #44]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4"
     );
 }
@@ -49266,8 +50141,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_add_one_12(sp_digit* a)
         "ADCS	r3, r3, #0x0\n\t"
         "ADCS	r4, r4, #0x0\n\t"
         "STM	%[a]!, {r1, r2, r3, r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4"
     );
 }
@@ -49661,8 +50541,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_in_place_12(sp_digit* a,
         "BNE.N	L_sp_384_sub_in_place_12_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -49711,8 +50596,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_384_sub_in_place_12(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -49774,8 +50664,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r,
         "BLT.N	L_sp_384_mul_d_12_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #48]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -49862,8 +50757,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r,
         "UMLAL	r5, r3, %[b], r8\n\t"
         "STM	%[r]!, {r5}\n\t"
         "STR	r3, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -49880,10 +50780,10 @@ WC_OMIT_FRAME_POINTER static void sp_384_mul_d_12(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_384_word_12(sp_digit d1_p,
     sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_384_word_12(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -49929,8 +50829,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -49947,10 +50852,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_384_word_12(sp_digit d1_p,
     sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_384_word_12(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -50013,8 +50918,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -50729,8 +51639,13 @@ WC_OMIT_FRAME_POINTER static void sp_384_div2_mod_12(sp_digit* r,
         "ORR	r10, r10, r3, LSL #31\n\t"
         "STR	r9, [%[r], #40]\n\t"
         "STR	r10, [%[r], #44]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -51033,8 +51948,13 @@ WC_OMIT_FRAME_POINTER static int sp_384_num_bits_12(const sp_digit* a)
     "L_sp_384_num_bits_12_13_%=:\n\t"
 #endif
         "MOV	%[a], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
@@ -52140,8 +53060,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_521_mul_17_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -54185,8 +55110,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_17(sp_digit* r, const sp_digit* a,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "LDM	sp!, {r3}\n\t"
         "STM	%[r]!, {r3}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
             "r12"
     );
@@ -54309,8 +55239,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_521_sqr_17_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -55596,8 +56531,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_sqr_17(sp_digit* r, const sp_digit* a)
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
         "LDM	sp!, {r2}\n\t"
         "STM	%[r]!, {r2}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r12"
     );
@@ -55659,8 +56599,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_add_17(sp_digit* r,
         "STM	%[r]!, {r4}\n\t"
         "MOV	r4, #0x0\n\t"
         "ADC	%[r], r4, #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -55723,8 +56668,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_add_17(sp_digit* r,
         "STM	%[r]!, {r3}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -55995,8 +56945,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r,
         "BLT.N	L_sp_521_cond_sub_17_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -56090,8 +57045,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r,
         "SBCS	r6, r6, r8\n\t"
         "STR	r6, [%[r]]\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -56105,10 +57065,10 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_cond_sub_17(sp_digit* r,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_17(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_17(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a,
     const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -56220,8 +57180,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a,
         "ADCS	r7, r7, #0x0\n\t"
         "ADCS	r8, r8, #0x0\n\t"
         "STM	%[a]!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11", "r12", "lr"
     );
@@ -56245,10 +57210,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a,
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -56510,8 +57475,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
         "LSR	r3, r6, #9\n\t"
         "ADD	%[a], %[a], #0x4\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -56526,10 +57496,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_reduce_order_17(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -56746,8 +57716,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_reduce_order_17(
         "LSR	r5, r12, #9\n\t"
         "ADD	%[a], %[a], #0x4\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -57136,8 +58111,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_521_cmp_17(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -57197,10 +58177,10 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_add_17(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_add_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -57271,8 +58251,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r,
         "LDM	%[r], {r4}\n\t"
         "ADCS	r4, r4, #0x0\n\t"
         "STM	%[r]!, {r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -57290,10 +58275,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_dbl_17(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_dbl_17(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -57354,8 +58339,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r,
         "LDM	%[r], {r4}\n\t"
         "ADCS	r4, r4, #0x0\n\t"
         "STM	%[r]!, {r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r2", "r3"
     );
@@ -57373,10 +58363,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_tpl_17(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_tpl_17(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -57471,8 +58461,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r,
         "LDM	%[r], {r4}\n\t"
         "ADCS	r4, r4, #0x0\n\t"
         "STM	%[r]!, {r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r2", "r3"
     );
@@ -57491,10 +58486,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_sub_17(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_521_mont_sub_17(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -57567,8 +58562,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r,
         "LDM	%[r], {r4}\n\t"
         "SBCS	r4, r4, #0x0\n\t"
         "STM	%[r]!, {r4}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
@@ -57654,8 +58654,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_rshift1_17(sp_digit* r,
         "LSR	r3, r3, #1\n\t"
         "STR	r2, [%[r], #60]\n\t"
         "STR	r3, [%[r], #64]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4"
     );
 }
@@ -61706,8 +62711,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_add_one_17(sp_digit* a)
         "LDM	%[a], {r1}\n\t"
         "ADCS	r1, r1, #0x0\n\t"
         "STM	%[a]!, {r1}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4"
     );
 }
@@ -62151,8 +63161,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_rshift_17(sp_digit* r,
         "LSR	r5, r5, %[n]\n\t"
         "ORR	r4, r4, r3\n\t"
         "STRD	r4, r5, [%[r], #60]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r3", "r7"
     );
 }
@@ -62279,8 +63294,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_lshift_17(sp_digit* r,
         "ORR	r5, r5, r3\n\t"
         "STR	r4, [%[r]]\n\t"
         "STR	r5, [%[r], #4]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r3", "r7"
     );
 }
@@ -62505,8 +63525,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_lshift_34(sp_digit* r,
         "ORR	r6, r6, r3\n\t"
         "STR	r5, [%[r]]\n\t"
         "STR	r6, [%[r], #4]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [n] "r" (n)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r3", "r7"
     );
 }
@@ -62562,8 +63587,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_in_place_17(sp_digit* a,
         "SBCS	r2, r2, r6\n\t"
         "STM	%[a]!, {r2}\n\t"
         "SBC	%[a], %[a], %[a]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -62623,8 +63653,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_in_place_17(sp_digit* a,
         "SBCS	r2, r2, r6\n\t"
         "STM	%[a]!, {r2}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -62686,8 +63721,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r,
         "BLT.N	L_sp_521_mul_d_17_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #68]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -62799,8 +63839,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r,
         "UMLAL	r4, r5, %[b], r8\n\t"
         "STM	%[r]!, {r4}\n\t"
         "STR	r5, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -62817,10 +63862,10 @@ WC_OMIT_FRAME_POINTER static void sp_521_mul_d_17(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_521_word_17(sp_digit d1_p,
     sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_521_word_17(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -62866,8 +63911,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -62884,10 +63934,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_521_word_17(sp_digit d1_p,
     sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_521_word_17(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -62950,8 +64000,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -63623,8 +64678,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_17(sp_digit* r,
         "SBCS	r3, r3, r7\n\t"
         "STM	%[r]!, {r3}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12"
     );
@@ -63686,8 +64746,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_521_sub_17(sp_digit* r,
         "SBCS	r3, r3, r7\n\t"
         "STM	%[r]!, {r3}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -63855,8 +64920,13 @@ WC_OMIT_FRAME_POINTER static void sp_521_div2_mod_17(sp_digit* r,
         "ORR	r9, r9, r3, LSL #31\n\t"
         "STR	r8, [%[r], #60]\n\t"
         "STR	r9, [%[r], #64]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -64284,8 +65354,13 @@ WC_OMIT_FRAME_POINTER static int sp_521_num_bits_17(const sp_digit* a)
     "L_sp_521_num_bits_17_18_%=:\n\t"
 #endif
         "MOV	%[a], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a)
         :
+#else
+        :
+        : [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r1", "r2", "r3", "r4", "r5"
     );
     return (word32)(size_t)a;
@@ -66952,8 +68027,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_16(sp_digit* r, const sp_digit* a,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "LDM	sp!, {r3, r4, r5, r6}\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
             "r12"
     );
@@ -68106,8 +69186,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a)
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
         "LDM	sp!, {r2, r3, r4, r8}\n\t"
         "STM	%[r]!, {r2, r3, r4, r8}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r12"
     );
@@ -68164,8 +69249,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_16(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -68247,8 +69337,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
         "SBCS	r5, r5, r9\n\t"
         "STM	%[a]!, {r2, r3, r4, r5}\n\t"
         "SBC	%[a], r9, r9\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)a;
@@ -68333,8 +69428,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_32(sp_digit* r,
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "MOV	%[r], #0x0\n\t"
         "ADC	%[r], %[r], #0x0\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -68459,8 +69559,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_16(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "STM	%[r]!, {r3, r4, r5, r6}\n\t"
         "SBC	%[r], r6, r6\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -68624,8 +69729,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_32(sp_digit* r, const sp_digit* a,
 #else
         "BGT.N	L_sp_1024_mul_32_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -68743,8 +69853,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a)
 #else
         "BGT.N	L_sp_1024_sqr_32_store_%=\n\t"
 #endif
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
             "r11"
     );
@@ -68882,8 +69997,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_sub_in_place_32(sp_digit* a,
         "BNE.N	L_sp_1024_sub_in_place_32_word_%=\n\t"
 #endif
         "MOV	%[a], r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
     );
@@ -68942,8 +70062,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r,
         "BLT.N	L_sp_1024_cond_sub_32_words_%=\n\t"
 #endif
         "MOV	%[r], r4\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -69088,8 +70213,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_sub_32(sp_digit* r,
         "SBCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "SBC	%[r], r5, r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (word32)(size_t)r;
@@ -69145,8 +70275,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_add_32(sp_digit* r,
         "BNE.N	L_sp_1024_add_32_word_%=\n\t"
 #endif
         "MOV	%[r], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12"
     );
@@ -69209,8 +70344,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r,
         "BLT.N	L_sp_1024_mul_d_32_word_%=\n\t"
 #endif
         "STR	r3, [%[r], #128]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
@@ -69397,8 +70537,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r,
         "UMLAL	r4, r5, %[b], r8\n\t"
         "STM	%[r]!, {r4}\n\t"
         "STR	r5, [%[r]]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
@@ -69415,10 +70560,10 @@ WC_OMIT_FRAME_POINTER static void sp_1024_mul_d_32(sp_digit* r,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_1024_word_32(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_1024_word_32(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -69464,8 +70609,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1,
         "SUB	%[d0], %[d0], r3\n\t"
         "UDIV	r3, %[d0], %[div]\n\t"
         "ADD	%[d1], r6, r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -69482,10 +70632,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1,
  * Note that this is an approximate div. It may give an answer 1 larger.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_1024_word_32(
     sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE sp_digit div_1024_word_32(sp_digit d1,
     sp_digit d0, sp_digit div)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -69548,8 +70698,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1,
         "SUBS	r8, %[div], r9\n\t"
         "SBC	r8, r8, r8\n\t"
         "SUB	%[d1], r3, r8\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
+#else
+        :
+        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)d1;
@@ -69993,8 +71148,13 @@ WC_OMIT_FRAME_POINTER static sp_int32 sp_1024_cmp_32(const sp_digit* a,
         "EOR	r2, r2, r3\n\t"
 #endif /*WOLFSSL_SP_SMALL */
         "MOV	%[a], r2\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [b] "+r" (b)
         :
+#else
+        :
+        : [a] "r" (a), [b] "r" (b)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)a;
@@ -70336,10 +71496,10 @@ static int sp_1024_point_to_ecc_point_32(const sp_point_1024* p, ecc_point* pm)
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -70639,8 +71799,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
         "SBC	r12, r12, r12\n\t"
         "ORR	r3, r3, r12\n\t"
         "MOV	%[mp], r3\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -70655,10 +71820,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_reduce_32(
     sp_digit* a, const sp_digit* m, sp_digit mp)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -70868,8 +72033,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_reduce_32(
         "SBC	r3, r3, r3\n\t"
         "ORR	r5, r5, r3\n\t"
         "MOV	%[mp], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
+#else
+        :
+        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
     );
@@ -71022,10 +72192,10 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_add_32(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_add_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -71189,8 +72359,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "SBC	r7, r7, r11\n\t"
         "STM	%[r]!, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
@@ -71203,10 +72378,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_dbl_32(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_dbl_32(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -71353,8 +72528,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "SBC	r7, r7, r11\n\t"
         "STM	%[r]!, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7",
             "r12"
     );
@@ -71367,10 +72547,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_tpl_32(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_tpl_32(sp_digit* r,
     const sp_digit* a, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -71672,8 +72852,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r,
         "SBCS	r6, r6, r10\n\t"
         "SBC	r7, r7, r11\n\t"
         "STM	%[r]!, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7",
             "r12"
     );
@@ -71687,10 +72872,10 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r,
  * m   Modulus (prime).
  */
 #ifndef WOLFSSL_NO_VAR_ASSIGN_REG
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r_p,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_sub_32(sp_digit* r_p,
     const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
 #else
-WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r,
+WC_OMIT_FRAME_POINTER static SP_NOINLINE void sp_1024_mont_sub_32(sp_digit* r,
     const sp_digit* a, const sp_digit* b, const sp_digit* m)
 #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
 {
@@ -71848,8 +73033,13 @@ WC_OMIT_FRAME_POINTER SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r,
         "ADCS	r6, r6, r10\n\t"
         "ADC	r7, r7, r11\n\t"
         "STM	%[r]!, {r4, r5, r6, r7}\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
     );
@@ -71906,8 +73096,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r,
         "BLT.N	L_sp_1024_cond_add_32_words_%=\n\t"
 #endif
         "MOV	%[r], r5\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8"
     );
     return (word32)(size_t)r;
@@ -72052,8 +73247,13 @@ WC_OMIT_FRAME_POINTER static sp_digit sp_1024_cond_add_32(sp_digit* r,
         "ADCS	r7, r7, r9\n\t"
         "STM	%[r]!, {r6, r7}\n\t"
         "ADC	%[r], r10, r10\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (word32)(size_t)r;
@@ -72200,8 +73400,13 @@ WC_OMIT_FRAME_POINTER static void sp_1024_rshift1_32(sp_digit* r,
         "LSR	r3, r3, #1\n\t"
         "STR	r2, [%[r], #120]\n\t"
         "STR	r3, [%[r], #124]\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
         : [r] "+r" (r), [a] "+r" (a)
         :
+#else
+        :
+        : [r] "r" (r), [a] "r" (a)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
         : "memory", "cc", "r2", "r3", "r4"
     );
 }
diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index 1e907aee460..7aaae4bb60d 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -56771,10 +56771,10 @@ _sp_256_mont_div2_4:
         adcq	%r10, %r8
         movq	$0x00, %r11
         adcq	$0x00, %r11
-        shrdq	$0x01, %rax, %rdx
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r11, %r8
+        shrdq	$1, %rax, %rdx
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r11, %r8
         movq	%rdx, (%rdi)
         movq	%rax, 8(%rdi)
         movq	%rcx, 16(%rdi)
@@ -57615,10 +57615,10 @@ _sp_256_mont_div2_avx2_4:
         adcq	%r10, %r8
         movq	$0x00, %r11
         adcq	$0x00, %r11
-        shrdq	$0x01, %rax, %rdx
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r11, %r8
+        shrdq	$1, %rax, %rdx
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r11, %r8
         movq	%rdx, (%rdi)
         movq	%rax, 8(%rdi)
         movq	%rcx, 16(%rdi)
@@ -58754,10 +58754,10 @@ _sp_256_mod_inv_4:
         testb	$0x01, %r11b
         jnz	L_256_mod_inv_4_v_even_end
 L_256_mod_inv_4_v_even_start:
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         movb	$0x01, (%rsp,%r15,1)
         incq	%r15
         testb	$0x01, %r11b
@@ -58782,17 +58782,17 @@ L_256_mod_inv_4_uv_u:
         sbbq	%r12, %r8
         sbbq	%r13, %r9
         sbbq	%r14, %r10
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrq	$0x01, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrq	$1, %r10
         testb	$0x01, %cl
         jnz	L_256_mod_inv_4_usubv_even_end
 L_256_mod_inv_4_usubv_even_start:
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrq	$0x01, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrq	$1, %r10
         movb	$0x00, (%rsp,%r15,1)
         incq	%r15
         testb	$0x01, %cl
@@ -58814,17 +58814,17 @@ L_256_mod_inv_4_uv_v:
         sbbq	%r8, %r12
         sbbq	%r9, %r13
         sbbq	%r10, %r14
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         testb	$0x01, %r11b
         jnz	L_256_mod_inv_4_vsubu_even_end
 L_256_mod_inv_4_vsubu_even_start:
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrq	$0x01, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrq	$1, %r14
         movb	$0x01, (%rsp,%r15,1)
         incq	%r15
         testb	$0x01, %r11b
@@ -58877,10 +58877,10 @@ L_256_mod_inv_4_op_div2_b:
         adcq	24(%rdx), %r10
         adcq	$0x00, %rsi
 L_256_mod_inv_4_op_div2_b_mod:
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrdq	$0x01, %rsi, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrdq	$1, %rsi, %r10
         movb	(%rsp,%r15,1), %sil
         incq	%r15
         cmpb	$0x01, %sil
@@ -58910,10 +58910,10 @@ L_256_mod_inv_4_op_div2_d:
         adcq	24(%rdx), %r14
         adcq	$0x00, %rsi
 L_256_mod_inv_4_op_div2_d_mod:
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrdq	$0x01, %r14, %r13
-        shrdq	$0x01, %rsi, %r14
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrdq	$1, %r14, %r13
+        shrdq	$1, %rsi, %r14
         movb	(%rsp,%r15,1), %sil
         incq	%r15
         cmpb	$0x01, %sil
@@ -58951,6 +58951,11 @@ L_256_mod_inv_4_store_end:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_order:
 .long	0x00632551,0x01e84f3b,0x03bce6fa,0x03ffffff
 .long	0x03ff0000,0x00000000,0x00000000,0x00000000
@@ -58967,13 +58972,18 @@ L_sp256_mod_inv_avx2_4_order:
 .p2align	5
 #endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_one:
-.quad	0x1, 0x0
-.quad	0x0, 0x0
+.quad	0x0000000000000001,0x0000000000000000
+.quad	0x0000000000000000,0x0000000000000000
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_all_one:
 .long	0x00000001,0x00000001,0x00000001,0x00000001
 .long	0x00000001,0x00000001,0x00000001,0x00000001
@@ -58982,6 +58992,11 @@ L_sp256_mod_inv_avx2_4_all_one:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_mask01111:
 .long	0x00000000,0x00000001,0x00000001,0x00000001
 .long	0x00000001,0x00000000,0x00000000,0x00000000
@@ -58990,6 +59005,11 @@ L_sp256_mod_inv_avx2_4_mask01111:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_down_one_dword:
 .long	0x00000001,0x00000002,0x00000003,0x00000004
 .long	0x00000005,0x00000006,0x00000007,0x00000007
@@ -58998,6 +59018,11 @@ L_sp256_mod_inv_avx2_4_down_one_dword:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_neg:
 .long	0x00000000,0x00000000,0x00000000,0x00000000
 .long	0x80000000,0x00000000,0x00000000,0x00000000
@@ -59006,6 +59031,11 @@ L_sp256_mod_inv_avx2_4_neg:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_up_one_dword:
 .long	0x00000007,0x00000000,0x00000001,0x00000002
 .long	0x00000003,0x00000007,0x00000007,0x00000007
@@ -59014,6 +59044,11 @@ L_sp256_mod_inv_avx2_4_up_one_dword:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_sp256_mod_inv_avx2_4_mask26:
 .long	0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff
 .long	0x03ffffff,0x00000000,0x00000000,0x00000000
@@ -59073,10 +59108,10 @@ _sp_256_mod_inv_avx2_4:
         testb	$0x01, %r10b
         jnz	L_256_mod_inv_avx2_4_v_even_end
 L_256_mod_inv_avx2_4_v_even_start:
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrq	$0x01, %r13
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrq	$1, %r13
         vptest	%ymm8, %ymm2
         jz	L_256_mod_inv_avx2_4_v_even_shr1
         vpaddd	%ymm6, %ymm2, %ymm2
@@ -59119,10 +59154,10 @@ L_256_mod_inv_avx2_4_uv_u:
         vpaddd	%ymm7, %ymm1, %ymm1
 L_256_mod_inv_avx2_4_usubv_done_neg:
 L_256_mod_inv_avx2_4_usubv_shr1:
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrq	$0x01, %r9
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrq	$1, %r9
         vptest	%ymm8, %ymm0
         jz	L_256_mod_inv_avx2_4_usubv_sub_shr1
         vpaddd	%ymm6, %ymm0, %ymm0
@@ -59172,10 +59207,10 @@ L_256_mod_inv_avx2_4_uv_v:
         vpaddd	%ymm7, %ymm3, %ymm3
 L_256_mod_inv_avx2_4_vsubu_done_neg:
 L_256_mod_inv_avx2_4_vsubu_shr1:
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrq	$0x01, %r13
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrq	$1, %r13
         vptest	%ymm8, %ymm2
         jz	L_256_mod_inv_avx2_4_vsubu_sub_shr1
         vpaddd	%ymm6, %ymm2, %ymm2
@@ -59268,7 +59303,7 @@ L_256_mod_inv_avx2_4_store_done:
         adcq	%r13, %r12
         movslq	%r14d, %r14
         adcq	%r15, %r14
-        jge	L_256_mod_inv_avx2_4_3_no_add_order
+        jge	L_256_mod_inv_avx2_4_no_add_order
         movq	$0x9cac2fc632551, %rcx
         movq	$0xada7179e84f3b, %r9
         movq	$0xfffffffbce6fa, %r11
@@ -59296,7 +59331,7 @@ L_256_mod_inv_avx2_4_store_done:
         andq	%rsi, %r12
         sarq	$52, %r13
         addq	%r13, %r14
-L_256_mod_inv_avx2_4_3_no_add_order:
+L_256_mod_inv_avx2_4_no_add_order:
         movq	%r8, %rcx
         movq	%r10, %r9
         movq	%r12, %r11
@@ -61148,21 +61183,21 @@ _sp_384_mont_div2_6:
         adcq	$0x00, %r10
         movq	(%rsp), %rax
         movq	8(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, (%rdi)
         movq	16(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 8(%rdi)
         movq	24(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 16(%rdi)
         movq	32(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 24(%rdi)
         movq	40(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 32(%rdi)
-        shrdq	$0x01, %r10, %rcx
+        shrdq	$1, %r10, %rcx
         movq	%rcx, 40(%rdi)
         addq	$48, %rsp
         repz retq
@@ -61783,21 +61818,21 @@ _sp_384_mont_div2_avx2_6:
         adcq	$0x00, %r10
         movq	(%rdi), %r8
         movq	8(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, (%rdi)
         movq	16(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 8(%rdi)
         movq	24(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 16(%rdi)
         movq	32(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 24(%rdi)
         movq	40(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 32(%rdi)
-        shrdq	$0x01, %r10, %r9
+        shrdq	$1, %r10, %r9
         movq	%r9, 40(%rdi)
         repz retq
 #ifndef __APPLE__
@@ -62565,12 +62600,12 @@ _sp_384_rshift1_6:
         movq	24(%rsi), %r8
         movq	32(%rsi), %r9
         movq	40(%rsi), %r10
-        shrdq	$0x01, %rax, %rdx
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrq	$0x01, %r10
+        shrdq	$1, %rax, %rdx
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrq	$1, %r10
         movq	%rdx, (%rdi)
         movq	%rax, 8(%rdi)
         movq	%rcx, 16(%rdi)
@@ -62629,12 +62664,12 @@ _sp_384_div2_mod_6:
         movq	$0x00, %rdx
         adcq	$0x00, %rdx
 L_384_mod_inv_6_div2_mod_no_add:
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %rdx, %r11
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %rdx, %r11
         movq	%rax, (%rdi)
         movq	%rcx, 8(%rdi)
         movq	%r8, 16(%rdi)
@@ -66688,15 +66723,15 @@ _sp_521_mont_div2_9:
         sbbq	$0x00, %r13
         shlq	$9, %r14
         addq	%r14, %r13
-        shrdq	$0x01, %rax, %rdx
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrq	$0x01, %r13
+        shrdq	$1, %rax, %rdx
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrq	$1, %r13
         movq	%rdx, (%rdi)
         movq	%rax, 8(%rdi)
         movq	%rcx, 16(%rdi)
@@ -68454,15 +68489,15 @@ _sp_521_mont_div2_avx2_9:
         sbbq	$0x00, %r13
         shlq	$9, %r14
         addq	%r14, %r13
-        shrdq	$0x01, %rax, %rdx
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
-        shrdq	$0x01, %r11, %r10
-        shrdq	$0x01, %r12, %r11
-        shrdq	$0x01, %r13, %r12
-        shrq	$0x01, %r13
+        shrdq	$1, %rax, %rdx
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
+        shrdq	$1, %r11, %r10
+        shrdq	$1, %r12, %r11
+        shrdq	$1, %r13, %r12
+        shrq	$1, %r13
         movq	%rdx, (%rdi)
         movq	%rax, 8(%rdi)
         movq	%rcx, 16(%rdi)
@@ -69634,10 +69669,10 @@ _sp_521_rshift1_9:
         movq	16(%rsi), %rcx
         movq	24(%rsi), %r8
         movq	32(%rsi), %r10
-        shrdq	$0x01, %rax, %rdx
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r10, %r8
+        shrdq	$1, %rax, %rdx
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r10, %r8
         movq	%rdx, (%rdi)
         movq	%rax, 8(%rdi)
         movq	%rcx, 16(%rdi)
@@ -69646,15 +69681,15 @@ _sp_521_rshift1_9:
         movq	48(%rsi), %rcx
         movq	56(%rsi), %r8
         movq	64(%rsi), %rdx
-        shrdq	$0x01, %rax, %r10
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %rdx, %r8
+        shrdq	$1, %rax, %r10
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %rdx, %r8
         movq	%r10, 32(%rdi)
         movq	%rax, 40(%rdi)
         movq	%rcx, 48(%rdi)
         movq	%r8, 56(%rdi)
-        shrq	$0x01, %rdx
+        shrq	$1, %rdx
         movq	%rdx, 64(%rdi)
         repz retq
 #ifndef __APPLE__
@@ -69723,10 +69758,10 @@ L_521_mod_inv_9_div2_mod_no_add:
         movq	16(%rsi), %r8
         movq	24(%rsi), %r9
         movq	32(%rsi), %r10
-        shrdq	$0x01, %rcx, %rax
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %r10, %r9
+        shrdq	$1, %rcx, %rax
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %r10, %r9
         movq	%rax, (%rdi)
         movq	%rcx, 8(%rdi)
         movq	%r8, 16(%rdi)
@@ -69735,15 +69770,15 @@ L_521_mod_inv_9_div2_mod_no_add:
         movq	48(%rsi), %r8
         movq	56(%rsi), %r9
         movq	64(%rsi), %rax
-        shrdq	$0x01, %rcx, %r10
-        shrdq	$0x01, %r8, %rcx
-        shrdq	$0x01, %r9, %r8
-        shrdq	$0x01, %rax, %r9
+        shrdq	$1, %rcx, %r10
+        shrdq	$1, %r8, %rcx
+        shrdq	$1, %r9, %r8
+        shrdq	$1, %rax, %r9
         movq	%r10, 32(%rdi)
         movq	%rcx, 40(%rdi)
         movq	%r8, 48(%rdi)
         movq	%r9, 56(%rdi)
-        shrq	$0x01, %rax
+        shrq	$1, %rax
         movq	%rax, 64(%rdi)
         repz retq
 #ifndef __APPLE__
@@ -77428,51 +77463,51 @@ _sp_1024_mont_div2_16:
         adcq	$0x00, %r10
         movq	(%rsp), %rax
         movq	8(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, (%rdi)
         movq	16(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 8(%rdi)
         movq	24(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 16(%rdi)
         movq	32(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 24(%rdi)
         movq	40(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 32(%rdi)
         movq	48(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 40(%rdi)
         movq	56(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 48(%rdi)
         movq	64(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 56(%rdi)
         movq	72(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 64(%rdi)
         movq	80(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 72(%rdi)
         movq	88(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 80(%rdi)
         movq	96(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 88(%rdi)
         movq	104(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 96(%rdi)
         movq	112(%rsp), %rax
-        shrdq	$0x01, %rax, %rcx
+        shrdq	$1, %rax, %rcx
         movq	%rcx, 104(%rdi)
         movq	120(%rsp), %rcx
-        shrdq	$0x01, %rcx, %rax
+        shrdq	$1, %rcx, %rax
         movq	%rax, 112(%rdi)
-        shrdq	$0x01, %r10, %rcx
+        shrdq	$1, %r10, %rcx
         movq	%rcx, 120(%rdi)
         addq	$0x80, %rsp
         repz retq
@@ -78692,51 +78727,51 @@ _sp_1024_mont_div2_avx2_16:
         adcq	$0x00, %r10
         movq	(%rdi), %r8
         movq	8(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, (%rdi)
         movq	16(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 8(%rdi)
         movq	24(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 16(%rdi)
         movq	32(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 24(%rdi)
         movq	40(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 32(%rdi)
         movq	48(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 40(%rdi)
         movq	56(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 48(%rdi)
         movq	64(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 56(%rdi)
         movq	72(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 64(%rdi)
         movq	80(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 72(%rdi)
         movq	88(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 80(%rdi)
         movq	96(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 88(%rdi)
         movq	104(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 96(%rdi)
         movq	112(%rdi), %r8
-        shrdq	$0x01, %r8, %r9
+        shrdq	$1, %r8, %r9
         movq	%r9, 104(%rdi)
         movq	120(%rdi), %r9
-        shrdq	$0x01, %r9, %r8
+        shrdq	$1, %r9, %r8
         movq	%r8, 112(%rdi)
-        shrdq	$0x01, %r10, %r9
+        shrdq	$1, %r10, %r9
         movq	%r9, 120(%rdi)
         repz retq
 #ifndef __APPLE__
diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm
index c91ccfa48b5..cdf57894078 100644
--- a/wolfcrypt/src/sp_x86_64_asm.asm
+++ b/wolfcrypt/src/sp_x86_64_asm.asm
@@ -18,6 +18,7 @@
 ;  * along with this program; if not, write to the Free Software
 ;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 ;  */
+
 IF @Version LT 1200
 ; AVX2 instructions not recognized by old versions of MASM
 IFNDEF NO_AVX2_SUPPORT
@@ -50,7 +51,7 @@ IFNDEF WOLFSSL_SP_NO_2048
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_from_bin_bswap PROC
         push	r12
         push	r13
@@ -128,7 +129,7 @@ L_2048_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_2048_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -138,7 +139,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -204,7 +205,7 @@ L_2048_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_2048_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 256
@@ -213,7 +214,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_to_bin_bswap_32 PROC
         mov	rax, QWORD PTR [rcx+248]
         mov	r8, QWORD PTR [rcx+240]
@@ -313,7 +314,7 @@ sp_2048_to_bin_bswap_32 PROC
         mov	QWORD PTR [rdx+248], r8
         ret
 sp_2048_to_bin_bswap_32 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 256
@@ -322,7 +323,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_to_bin_movbe_32 PROC
         movbe	rax, QWORD PTR [rcx+248]
         movbe	r8, QWORD PTR [rcx+240]
@@ -390,7 +391,7 @@ sp_2048_to_bin_movbe_32 PROC
         mov	QWORD PTR [rdx+248], r8
         ret
 sp_2048_to_bin_movbe_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -398,7 +399,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_16 PROC
         push	r12
         mov	r9, rdx
@@ -2034,7 +2035,7 @@ sp_2048_mul_16 PROC
         pop	r12
         ret
 sp_2048_mul_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -2042,7 +2043,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply.
 ;  * b   Second number to multiply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_avx2_16 PROC
         push	rbx
         push	rbp
@@ -3705,7 +3706,7 @@ L_end_2048_mul_avx2_16:
         pop	rbx
         ret
 sp_2048_mul_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Add b to a into r. (r = a + b)
 ;  *
@@ -3713,7 +3714,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_add_16 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -3768,13 +3769,13 @@ sp_2048_add_16 PROC
         adc	rax, 0
         ret
 sp_2048_add_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sub_in_place_32 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -3875,14 +3876,14 @@ sp_2048_sub_in_place_32 PROC
         sbb	rax, rax
         ret
 sp_2048_sub_in_place_32 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add b to a into r. (r = a + b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_add_32 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -3985,14 +3986,14 @@ sp_2048_add_32 PROC
         adc	rax, 0
         ret
 sp_2048_add_32 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_32 PROC
         push	r12
         push	r13
@@ -4690,7 +4691,7 @@ ENDIF
         pop	r12
         ret
 sp_2048_mul_32 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -4698,7 +4699,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_avx2_32 PROC
         push	r12
         push	r13
@@ -5348,14 +5349,14 @@ ENDIF
         pop	r12
         ret
 sp_2048_mul_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sqr_16 PROC
         push	r12
         push	r13
@@ -6437,14 +6438,14 @@ sp_2048_sqr_16 PROC
         pop	r12
         ret
 sp_2048_sqr_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sqr_avx2_16 PROC
         push	rbp
         push	r12
@@ -7490,7 +7491,7 @@ L_end_2048_sqr_avx2_16:
         pop	rbp
         ret
 sp_2048_sqr_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -7499,7 +7500,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sqr_32 PROC
         sub	rsp, 272
         mov	QWORD PTR [rsp+256], rcx
@@ -8008,7 +8009,7 @@ ENDIF
         add	rsp, 272
         ret
 sp_2048_sqr_32 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -8017,7 +8018,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sqr_avx2_32 PROC
         sub	rsp, 272
         mov	QWORD PTR [rsp+256], rcx
@@ -8526,14 +8527,14 @@ ENDIF
         add	rsp, 272
         ret
 sp_2048_sqr_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sub_in_place_16 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -8586,14 +8587,14 @@ sp_2048_sub_in_place_16 PROC
         sbb	rax, rax
         ret
 sp_2048_sub_in_place_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_d_32 PROC
         push	r12
         mov	r9, rdx
@@ -8854,7 +8855,7 @@ sp_2048_mul_d_32 PROC
         pop	r12
         ret
 sp_2048_mul_d_32 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -8863,7 +8864,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cond_sub_16 PROC
         sub	rsp, 128
         mov	r10, QWORD PTR [r8]
@@ -8982,14 +8983,14 @@ sp_2048_cond_sub_16 PROC
         add	rsp, 128
         ret
 sp_2048_cond_sub_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 2048 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mont_reduce_16 PROC
         push	r12
         push	r13
@@ -9189,7 +9190,7 @@ ENDIF
         pop	r12
         ret
 sp_2048_mont_reduce_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
@@ -9199,7 +9200,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cond_sub_avx2_16 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -9286,7 +9287,7 @@ sp_2048_cond_sub_avx2_16 PROC
         pop	r12
         ret
 sp_2048_cond_sub_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -9294,7 +9295,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_d_16 PROC
         push	r12
         mov	r9, rdx
@@ -9427,7 +9428,7 @@ sp_2048_mul_d_16 PROC
         pop	r12
         ret
 sp_2048_mul_d_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -9435,7 +9436,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_d_avx2_16 PROC
         push	r12
         push	r13
@@ -9541,7 +9542,7 @@ sp_2048_mul_d_avx2_16 PROC
         pop	r12
         ret
 sp_2048_mul_d_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -9551,7 +9552,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_2048_word_asm_16 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -9559,7 +9560,7 @@ div_2048_word_asm_16 PROC
         div	r8
         ret
 div_2048_word_asm_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Compare a with b in constant time.
 ;  *
@@ -9568,7 +9569,7 @@ ENDIF
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cmp_16 PROC
         push	r12
         xor	r9, r9
@@ -9707,9 +9708,9 @@ sp_2048_cmp_16 PROC
         pop	r12
         ret
 sp_2048_cmp_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_get_from_table_16 PROC
         sub	rsp, 128
         movdqu	OWORD PTR [rsp], xmm6
@@ -10880,7 +10881,7 @@ sp_2048_get_from_table_16 PROC
         add	rsp, 128
         ret
 sp_2048_get_from_table_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 2048 bits using Montgomery reduction.
@@ -10889,7 +10890,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mont_reduce_avx2_16 PROC
         push	r12
         push	r13
@@ -11212,10 +11213,10 @@ L_2048_mont_reduce_avx2_16_loop:
         pop	r12
         ret
 sp_2048_mont_reduce_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_get_from_table_avx2_16 PROC
         sub	rsp, 128
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -11766,7 +11767,7 @@ sp_2048_get_from_table_avx2_16 PROC
         add	rsp, 128
         ret
 sp_2048_get_from_table_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
@@ -11776,7 +11777,7 @@ ENDIF
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cond_sub_32 PROC
         sub	rsp, 256
         mov	r10, QWORD PTR [r8]
@@ -12007,14 +12008,14 @@ sp_2048_cond_sub_32 PROC
         add	rsp, 256
         ret
 sp_2048_cond_sub_32 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 2048 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mont_reduce_32 PROC
         push	r12
         push	r13
@@ -12374,14 +12375,14 @@ ENDIF
         pop	r12
         ret
 sp_2048_mont_reduce_32 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into r. (r = a - b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_sub_32 PROC
         mov	r9, QWORD PTR [rdx]
         sub	r9, QWORD PTR [r8]
@@ -12482,7 +12483,7 @@ sp_2048_sub_32 PROC
         sbb	rax, rax
         ret
 sp_2048_sub_32 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -12490,7 +12491,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mul_d_avx2_32 PROC
         push	r12
         push	r13
@@ -12692,7 +12693,7 @@ sp_2048_mul_d_avx2_32 PROC
         pop	r12
         ret
 sp_2048_mul_d_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -12702,7 +12703,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_2048_word_asm_32 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -12710,7 +12711,7 @@ div_2048_word_asm_32 PROC
         div	r8
         ret
 div_2048_word_asm_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
@@ -12721,7 +12722,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cond_sub_avx2_32 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -12888,7 +12889,7 @@ sp_2048_cond_sub_avx2_32 PROC
         pop	r12
         ret
 sp_2048_cond_sub_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Compare a with b in constant time.
 ;  *
@@ -12897,7 +12898,7 @@ ENDIF
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cmp_32 PROC
         push	r12
         xor	r9, r9
@@ -13164,9 +13165,9 @@ sp_2048_cmp_32 PROC
         pop	r12
         ret
 sp_2048_cmp_32 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_get_from_table_32 PROC
         sub	rsp, 128
         movdqu	OWORD PTR [rsp], xmm6
@@ -17785,7 +17786,7 @@ sp_2048_get_from_table_32 PROC
         add	rsp, 128
         ret
 sp_2048_get_from_table_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 2048 bits using Montgomery reduction.
@@ -17794,7 +17795,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_mont_reduce_avx2_32 PROC
         push	r12
         push	r13
@@ -18192,10 +18193,10 @@ L_2048_mont_reduce_avx2_32_loop:
         pop	r12
         ret
 sp_2048_mont_reduce_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_get_from_table_avx2_32 PROC
         sub	rsp, 128
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -20358,7 +20359,7 @@ sp_2048_get_from_table_avx2_32 PROC
         add	rsp, 128
         ret
 sp_2048_get_from_table_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Conditionally add a and b using the mask m.
 ;  * m is -1 to add and 0 when not.
@@ -20368,7 +20369,7 @@ ENDIF
 ;  * b  A single precision number to add.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cond_add_16 PROC
         sub	rsp, 128
         mov	rax, 0
@@ -20488,7 +20489,7 @@ sp_2048_cond_add_16 PROC
         add	rsp, 128
         ret
 sp_2048_cond_add_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally add a and b using the mask m.
 ;  * m is -1 to add and 0 when not.
@@ -20498,7 +20499,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to add.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_cond_add_avx2_16 PROC
         push	r12
         mov	rax, 0
@@ -20586,7 +20587,7 @@ sp_2048_cond_add_avx2_16 PROC
         pop	r12
         ret
 sp_2048_cond_add_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Shift number left by n bit. (r = a << n)
 ;  *
@@ -20594,7 +20595,7 @@ ENDIF
 ;  * a  Number to shift.
 ;  * n  Amoutnt o shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_2048_lshift_32 PROC
         push	r12
         push	r13
@@ -20703,7 +20704,7 @@ sp_2048_lshift_32 PROC
         pop	r12
         ret
 sp_2048_lshift_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFNDEF WOLFSSL_SP_NO_3072
@@ -20716,7 +20717,7 @@ IFNDEF WOLFSSL_SP_NO_3072
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_from_bin_bswap PROC
         push	r12
         push	r13
@@ -20794,7 +20795,7 @@ L_3072_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_3072_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -20804,7 +20805,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -20870,7 +20871,7 @@ L_3072_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_3072_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 384
@@ -20879,7 +20880,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_to_bin_bswap_48 PROC
         mov	rax, QWORD PTR [rcx+376]
         mov	r8, QWORD PTR [rcx+368]
@@ -21027,7 +21028,7 @@ sp_3072_to_bin_bswap_48 PROC
         mov	QWORD PTR [rdx+376], r8
         ret
 sp_3072_to_bin_bswap_48 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 384
@@ -21036,7 +21037,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_to_bin_movbe_48 PROC
         movbe	rax, QWORD PTR [rcx+376]
         movbe	r8, QWORD PTR [rcx+368]
@@ -21136,7 +21137,7 @@ sp_3072_to_bin_movbe_48 PROC
         mov	QWORD PTR [rdx+376], r8
         ret
 sp_3072_to_bin_movbe_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -21144,7 +21145,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_12 PROC
         push	r12
         mov	r9, rdx
@@ -22084,7 +22085,7 @@ sp_3072_mul_12 PROC
         pop	r12
         ret
 sp_3072_mul_12 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -22092,7 +22093,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply.
 ;  * b   Second number to multiply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_avx2_12 PROC
         push	rbx
         push	rbp
@@ -23055,7 +23056,7 @@ L_end_3072_mul_avx2_12:
         pop	rbx
         ret
 sp_3072_mul_avx2_12 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Add b to a into r. (r = a + b)
 ;  *
@@ -23063,7 +23064,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_add_12 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -23106,13 +23107,13 @@ sp_3072_add_12 PROC
         adc	rax, 0
         ret
 sp_3072_add_12 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sub_in_place_24 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -23189,14 +23190,14 @@ sp_3072_sub_in_place_24 PROC
         sbb	rax, rax
         ret
 sp_3072_sub_in_place_24 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add b to a into r. (r = a + b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_add_24 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -23275,14 +23276,14 @@ sp_3072_add_24 PROC
         adc	rax, 0
         ret
 sp_3072_add_24 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_24 PROC
         push	r12
         push	r13
@@ -23824,7 +23825,7 @@ ENDIF
         pop	r12
         ret
 sp_3072_mul_24 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -23832,7 +23833,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_avx2_24 PROC
         push	r12
         push	r13
@@ -24338,14 +24339,14 @@ ENDIF
         pop	r12
         ret
 sp_3072_mul_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sub_in_place_48 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -24494,14 +24495,14 @@ sp_3072_sub_in_place_48 PROC
         sbb	rax, rax
         ret
 sp_3072_sub_in_place_48 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add b to a into r. (r = a + b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_add_48 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -24652,14 +24653,14 @@ sp_3072_add_48 PROC
         adc	rax, 0
         ret
 sp_3072_add_48 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_48 PROC
         push	r12
         push	r13
@@ -25669,7 +25670,7 @@ ENDIF
         pop	r12
         ret
 sp_3072_mul_48 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -25677,7 +25678,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_avx2_48 PROC
         push	r12
         push	r13
@@ -26615,14 +26616,14 @@ ENDIF
         pop	r12
         ret
 sp_3072_mul_avx2_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sqr_12 PROC
         push	r12
         push	r13
@@ -27284,14 +27285,14 @@ sp_3072_sqr_12 PROC
         pop	r12
         ret
 sp_3072_sqr_12 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sqr_avx2_12 PROC
         push	rbp
         push	r12
@@ -27926,7 +27927,7 @@ L_end_3072_sqr_avx2_12:
         pop	rbp
         ret
 sp_3072_sqr_avx2_12 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -27935,7 +27936,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sqr_24 PROC
         sub	rsp, 208
         mov	QWORD PTR [rsp+192], rcx
@@ -28328,7 +28329,7 @@ ENDIF
         add	rsp, 208
         ret
 sp_3072_sqr_24 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -28337,7 +28338,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sqr_avx2_24 PROC
         sub	rsp, 208
         mov	QWORD PTR [rsp+192], rcx
@@ -28730,7 +28731,7 @@ ENDIF
         add	rsp, 208
         ret
 sp_3072_sqr_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -28739,7 +28740,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sqr_48 PROC
         sub	rsp, 400
         mov	QWORD PTR [rsp+384], rcx
@@ -29480,7 +29481,7 @@ ENDIF
         add	rsp, 400
         ret
 sp_3072_sqr_48 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -29489,7 +29490,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sqr_avx2_48 PROC
         sub	rsp, 400
         mov	QWORD PTR [rsp+384], rcx
@@ -30230,7 +30231,7 @@ ENDIF
         add	rsp, 400
         ret
 sp_3072_sqr_avx2_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -30238,7 +30239,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_d_48 PROC
         push	r12
         mov	r9, rdx
@@ -30627,7 +30628,7 @@ sp_3072_mul_d_48 PROC
         pop	r12
         ret
 sp_3072_mul_d_48 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -30636,7 +30637,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cond_sub_24 PROC
         sub	rsp, 192
         mov	r10, QWORD PTR [r8]
@@ -30811,14 +30812,14 @@ sp_3072_cond_sub_24 PROC
         add	rsp, 192
         ret
 sp_3072_cond_sub_24 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 3072 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mont_reduce_24 PROC
         push	r12
         push	r13
@@ -31098,7 +31099,7 @@ ENDIF
         pop	r12
         ret
 sp_3072_mont_reduce_24 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
@@ -31108,7 +31109,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cond_sub_avx2_24 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -31235,7 +31236,7 @@ sp_3072_cond_sub_avx2_24 PROC
         pop	r12
         ret
 sp_3072_cond_sub_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -31243,7 +31244,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_d_24 PROC
         push	r12
         mov	r9, rdx
@@ -31440,7 +31441,7 @@ sp_3072_mul_d_24 PROC
         pop	r12
         ret
 sp_3072_mul_d_24 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -31448,7 +31449,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_d_avx2_24 PROC
         push	r12
         push	r13
@@ -31602,7 +31603,7 @@ sp_3072_mul_d_avx2_24 PROC
         pop	r12
         ret
 sp_3072_mul_d_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -31612,7 +31613,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_3072_word_asm_24 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -31620,7 +31621,7 @@ div_3072_word_asm_24 PROC
         div	r8
         ret
 div_3072_word_asm_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Compare a with b in constant time.
 ;  *
@@ -31629,7 +31630,7 @@ ENDIF
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cmp_24 PROC
         push	r12
         xor	r9, r9
@@ -31832,9 +31833,9 @@ sp_3072_cmp_24 PROC
         pop	r12
         ret
 sp_3072_cmp_24 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_get_from_table_24 PROC
         sub	rsp, 128
         movdqu	OWORD PTR [rsp], xmm6
@@ -33593,7 +33594,7 @@ sp_3072_get_from_table_24 PROC
         add	rsp, 128
         ret
 sp_3072_get_from_table_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 3072 bits using Montgomery reduction.
@@ -33602,7 +33603,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mont_reduce_avx2_24 PROC
         push	r12
         push	r13
@@ -33912,10 +33913,10 @@ L_3072_mont_reduce_avx2_24_loop:
         pop	r12
         ret
 sp_3072_mont_reduce_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_get_from_table_avx2_24 PROC
         sub	rsp, 128
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -34826,7 +34827,7 @@ sp_3072_get_from_table_avx2_24 PROC
         add	rsp, 128
         ret
 sp_3072_get_from_table_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
@@ -34836,7 +34837,7 @@ ENDIF
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cond_sub_48 PROC
         sub	rsp, 384
         mov	r10, QWORD PTR [r8]
@@ -35179,14 +35180,14 @@ sp_3072_cond_sub_48 PROC
         add	rsp, 384
         ret
 sp_3072_cond_sub_48 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 3072 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mont_reduce_48 PROC
         push	r12
         push	r13
@@ -35706,14 +35707,14 @@ ENDIF
         pop	r12
         ret
 sp_3072_mont_reduce_48 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into r. (r = a - b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_sub_48 PROC
         mov	r9, QWORD PTR [rdx]
         sub	r9, QWORD PTR [r8]
@@ -35862,7 +35863,7 @@ sp_3072_sub_48 PROC
         sbb	rax, rax
         ret
 sp_3072_sub_48 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -35870,7 +35871,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mul_d_avx2_48 PROC
         push	r12
         push	r13
@@ -36168,7 +36169,7 @@ sp_3072_mul_d_avx2_48 PROC
         pop	r12
         ret
 sp_3072_mul_d_avx2_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -36178,7 +36179,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_3072_word_asm_48 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -36186,7 +36187,7 @@ div_3072_word_asm_48 PROC
         div	r8
         ret
 div_3072_word_asm_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
@@ -36197,7 +36198,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cond_sub_avx2_48 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -36444,7 +36445,7 @@ sp_3072_cond_sub_avx2_48 PROC
         pop	r12
         ret
 sp_3072_cond_sub_avx2_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Compare a with b in constant time.
 ;  *
@@ -36453,7 +36454,7 @@ ENDIF
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cmp_48 PROC
         push	r12
         xor	r9, r9
@@ -36848,9 +36849,9 @@ sp_3072_cmp_48 PROC
         pop	r12
         ret
 sp_3072_cmp_48 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_get_from_table_48 PROC
         sub	rsp, 128
         movdqu	OWORD PTR [rsp], xmm6
@@ -38661,7 +38662,7 @@ sp_3072_get_from_table_48 PROC
         add	rsp, 128
         ret
 sp_3072_get_from_table_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 3072 bits using Montgomery reduction.
@@ -38670,7 +38671,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_mont_reduce_avx2_48 PROC
         push	r12
         push	r13
@@ -39244,10 +39245,10 @@ L_3072_mont_reduce_avx2_48_loop:
         pop	r12
         ret
 sp_3072_mont_reduce_avx2_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_get_from_table_avx2_48 PROC
         sub	rsp, 128
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -40110,7 +40111,7 @@ sp_3072_get_from_table_avx2_48 PROC
         add	rsp, 128
         ret
 sp_3072_get_from_table_avx2_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Conditionally add a and b using the mask m.
 ;  * m is -1 to add and 0 when not.
@@ -40120,7 +40121,7 @@ ENDIF
 ;  * b  A single precision number to add.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cond_add_24 PROC
         sub	rsp, 192
         mov	rax, 0
@@ -40296,7 +40297,7 @@ sp_3072_cond_add_24 PROC
         add	rsp, 192
         ret
 sp_3072_cond_add_24 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally add a and b using the mask m.
 ;  * m is -1 to add and 0 when not.
@@ -40306,7 +40307,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to add.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_cond_add_avx2_24 PROC
         push	r12
         mov	rax, 0
@@ -40434,7 +40435,7 @@ sp_3072_cond_add_avx2_24 PROC
         pop	r12
         ret
 sp_3072_cond_add_avx2_24 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Shift number left by n bit. (r = a << n)
 ;  *
@@ -40442,7 +40443,7 @@ ENDIF
 ;  * a  Number to shift.
 ;  * n  Amoutnt o shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_3072_lshift_48 PROC
         push	r12
         push	r13
@@ -40599,7 +40600,7 @@ sp_3072_lshift_48 PROC
         pop	r12
         ret
 sp_3072_lshift_48 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFDEF WOLFSSL_SP_4096
@@ -40612,7 +40613,7 @@ IFDEF WOLFSSL_SP_4096
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_from_bin_bswap PROC
         push	r12
         push	r13
@@ -40690,7 +40691,7 @@ L_4096_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_4096_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -40700,7 +40701,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -40766,7 +40767,7 @@ L_4096_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_4096_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 512
@@ -40775,7 +40776,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_to_bin_bswap_64 PROC
         mov	rax, QWORD PTR [rcx+504]
         mov	r8, QWORD PTR [rcx+496]
@@ -40971,7 +40972,7 @@ sp_4096_to_bin_bswap_64 PROC
         mov	QWORD PTR [rdx+504], r8
         ret
 sp_4096_to_bin_bswap_64 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 512
@@ -40980,7 +40981,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_to_bin_movbe_64 PROC
         movbe	rax, QWORD PTR [rcx+504]
         movbe	r8, QWORD PTR [rcx+496]
@@ -41112,14 +41113,14 @@ sp_4096_to_bin_movbe_64 PROC
         mov	QWORD PTR [rdx+504], r8
         ret
 sp_4096_to_bin_movbe_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_sub_in_place_64 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -41316,14 +41317,14 @@ sp_4096_sub_in_place_64 PROC
         sbb	rax, rax
         ret
 sp_4096_sub_in_place_64 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add b to a into r. (r = a + b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_add_64 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -41522,14 +41523,14 @@ sp_4096_add_64 PROC
         adc	rax, 0
         ret
 sp_4096_add_64 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_mul_64 PROC
         push	r12
         push	r13
@@ -42851,7 +42852,7 @@ ENDIF
         pop	r12
         ret
 sp_4096_mul_64 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -42859,7 +42860,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_mul_avx2_64 PROC
         push	r12
         push	r13
@@ -44085,7 +44086,7 @@ ENDIF
         pop	r12
         ret
 sp_4096_mul_avx2_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -44094,7 +44095,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_sqr_64 PROC
         sub	rsp, 528
         mov	QWORD PTR [rsp+512], rcx
@@ -45067,7 +45068,7 @@ ENDIF
         add	rsp, 528
         ret
 sp_4096_sqr_64 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
@@ -45076,7 +45077,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_sqr_avx2_64 PROC
         sub	rsp, 528
         mov	QWORD PTR [rsp+512], rcx
@@ -46049,7 +46050,7 @@ ENDIF
         add	rsp, 528
         ret
 sp_4096_sqr_avx2_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -46057,7 +46058,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_mul_d_64 PROC
         push	r12
         mov	r9, rdx
@@ -46574,7 +46575,7 @@ sp_4096_mul_d_64 PROC
         pop	r12
         ret
 sp_4096_mul_d_64 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -46583,7 +46584,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_cond_sub_64 PROC
         sub	rsp, 512
         mov	r10, QWORD PTR [r8]
@@ -47038,14 +47039,14 @@ sp_4096_cond_sub_64 PROC
         add	rsp, 512
         ret
 sp_4096_cond_sub_64 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 4096 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_mont_reduce_64 PROC
         push	r12
         push	r13
@@ -47725,14 +47726,14 @@ ENDIF
         pop	r12
         ret
 sp_4096_mont_reduce_64 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into r. (r = a - b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_sub_64 PROC
         mov	r9, QWORD PTR [rdx]
         sub	r9, QWORD PTR [r8]
@@ -47929,7 +47930,7 @@ sp_4096_sub_64 PROC
         sbb	rax, rax
         ret
 sp_4096_sub_64 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -47937,7 +47938,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_mul_d_avx2_64 PROC
         push	r12
         push	r13
@@ -48331,7 +48332,7 @@ sp_4096_mul_d_avx2_64 PROC
         pop	r12
         ret
 sp_4096_mul_d_avx2_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -48341,7 +48342,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_4096_word_asm_64 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -48349,7 +48350,7 @@ div_4096_word_asm_64 PROC
         div	r8
         ret
 div_4096_word_asm_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
@@ -48360,7 +48361,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_cond_sub_avx2_64 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -48687,7 +48688,7 @@ sp_4096_cond_sub_avx2_64 PROC
         pop	r12
         ret
 sp_4096_cond_sub_avx2_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Compare a with b in constant time.
 ;  *
@@ -48696,7 +48697,7 @@ ENDIF
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_cmp_64 PROC
         push	r12
         xor	r9, r9
@@ -49219,9 +49220,9 @@ sp_4096_cmp_64 PROC
         pop	r12
         ret
 sp_4096_cmp_64 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_get_from_table_64 PROC
         sub	rsp, 128
         movdqu	OWORD PTR [rsp], xmm6
@@ -51632,7 +51633,7 @@ sp_4096_get_from_table_64 PROC
         add	rsp, 128
         ret
 sp_4096_get_from_table_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 4096 bits using Montgomery reduction.
@@ -51641,7 +51642,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_mont_reduce_avx2_64 PROC
         push	r12
         push	r13
@@ -52391,10 +52392,10 @@ L_4096_mont_reduce_avx2_64_loop:
         pop	r12
         ret
 sp_4096_mont_reduce_avx2_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_get_from_table_avx2_64 PROC
         sub	rsp, 128
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -53541,7 +53542,7 @@ sp_4096_get_from_table_avx2_64 PROC
         add	rsp, 128
         ret
 sp_4096_get_from_table_avx2_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Conditionally add a and b using the mask m.
 ;  * m is -1 to add and 0 when not.
@@ -53551,7 +53552,7 @@ ENDIF
 ;  * b  A single precision number to add.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_cond_add_32 PROC
         sub	rsp, 256
         mov	rax, 0
@@ -53783,7 +53784,7 @@ sp_4096_cond_add_32 PROC
         add	rsp, 256
         ret
 sp_4096_cond_add_32 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally add a and b using the mask m.
 ;  * m is -1 to add and 0 when not.
@@ -53793,7 +53794,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to add.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_cond_add_avx2_32 PROC
         push	r12
         mov	rax, 0
@@ -53961,7 +53962,7 @@ sp_4096_cond_add_avx2_32 PROC
         pop	r12
         ret
 sp_4096_cond_add_avx2_32 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Shift number left by n bit. (r = a << n)
 ;  *
@@ -53969,7 +53970,7 @@ ENDIF
 ;  * a  Number to shift.
 ;  * n  Amoutnt o shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_4096_lshift_64 PROC
         push	r12
         push	r13
@@ -54174,7 +54175,7 @@ sp_4096_lshift_64 PROC
         pop	r12
         ret
 sp_4096_lshift_64 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFNDEF WOLFSSL_SP_NO_256
@@ -54184,7 +54185,7 @@ IFNDEF WOLFSSL_SP_NO_256
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mul_4 PROC
         push	r12
         mov	r9, rdx
@@ -54308,7 +54309,7 @@ sp_256_mul_4 PROC
         pop	r12
         ret
 sp_256_mul_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -54316,7 +54317,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply.
 ;  * b   Second number to multiply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mul_avx2_4 PROC
         push	rbp
         push	r12
@@ -54418,14 +54419,14 @@ sp_256_mul_avx2_4 PROC
         pop	rbp
         ret
 sp_256_mul_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_sqr_4 PROC
         push	r12
         push	r13
@@ -54535,14 +54536,14 @@ sp_256_sqr_4 PROC
         pop	r12
         ret
 sp_256_sqr_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r   Result of squaring.
 ;  * a   Number to square in Montgomery form.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_sqr_avx2_4 PROC
         push	r12
         push	r13
@@ -54626,7 +54627,7 @@ sp_256_sqr_avx2_4 PROC
         pop	r12
         ret
 sp_256_sqr_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Add b to a into r. (r = a + b)
 ;  *
@@ -54634,7 +54635,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_add_4 PROC
         push	r12
         xor	rax, rax
@@ -54654,14 +54655,14 @@ sp_256_add_4 PROC
         pop	r12
         ret
 sp_256_add_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into r. (r = a - b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_sub_4 PROC
         push	r12
         xor	rax, rax
@@ -54681,7 +54682,7 @@ sp_256_sub_4 PROC
         pop	r12
         ret
 sp_256_sub_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally copy a into r using the mask m.
 ;  * m is -1 to copy and 0 when not.
 ;  *
@@ -54689,7 +54690,7 @@ _text ENDS
 ;  * a  A single precision number to copy.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_cond_copy_4 PROC
         mov	rax, QWORD PTR [rcx]
         mov	r9, QWORD PTR [rcx+8]
@@ -54709,7 +54710,7 @@ sp_256_cond_copy_4 PROC
         xor	QWORD PTR [rcx+24], r11
         ret
 sp_256_cond_copy_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Multiply two Montgomery form numbers mod the modulus (prime).
 ;  * (r = a * b mod m)
 ;  *
@@ -54719,7 +54720,7 @@ _text ENDS
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_mul_4 PROC
         push	r12
         push	r13
@@ -54907,7 +54908,7 @@ sp_256_mont_mul_4 PROC
         pop	r12
         ret
 sp_256_mont_mul_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
 ;  *
 ;  * r   Result of squaring.
@@ -54915,7 +54916,7 @@ _text ENDS
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_sqr_4 PROC
         push	r12
         push	r13
@@ -55082,7 +55083,7 @@ sp_256_mont_sqr_4 PROC
         pop	r12
         ret
 sp_256_mont_sqr_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Compare a with b in constant time.
 ;  *
 ;  * a  A single precision integer.
@@ -55090,7 +55091,7 @@ _text ENDS
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_cmp_4 PROC
         push	r12
         xor	r9, r9
@@ -55133,7 +55134,7 @@ sp_256_cmp_4 PROC
         pop	r12
         ret
 sp_256_cmp_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -55142,7 +55143,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_cond_sub_4 PROC
         push	r12
         push	r13
@@ -55179,14 +55180,14 @@ sp_256_cond_sub_4 PROC
         pop	r12
         ret
 sp_256_cond_sub_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 256 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_reduce_4 PROC
         push	rbx
         push	rsi
@@ -55284,14 +55285,14 @@ sp_256_mont_reduce_4 PROC
         pop	rbx
         ret
 sp_256_mont_reduce_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 256 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_reduce_order_4 PROC
         push	r12
         push	r13
@@ -55385,7 +55386,7 @@ L_mont_loop_4:
         pop	r12
         ret
 sp_256_mont_reduce_order_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add two Montgomery form numbers (r = a + b % m).
 ;  *
 ;  * r   Result of addition.
@@ -55393,7 +55394,7 @@ _text ENDS
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_add_4 PROC
         push	r12
         push	r13
@@ -55428,14 +55429,14 @@ sp_256_mont_add_4 PROC
         pop	r12
         ret
 sp_256_mont_add_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Double a Montgomery form number (r = a + a % m).
 ;  *
 ;  * r   Result of doubling.
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_dbl_4 PROC
         push	r12
         push	r13
@@ -55471,14 +55472,14 @@ sp_256_mont_dbl_4 PROC
         pop	r12
         ret
 sp_256_mont_dbl_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Triple a Montgomery form number (r = a + a + a % m).
 ;  *
 ;  * r   Result of Tripling.
 ;  * a   Number to triple in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_tpl_4 PROC
         push	r12
         push	r13
@@ -55532,7 +55533,7 @@ sp_256_mont_tpl_4 PROC
         pop	r12
         ret
 sp_256_mont_tpl_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Subtract two Montgomery form numbers (r = a - b % m).
 ;  *
 ;  * r   Result of subtration.
@@ -55540,7 +55541,7 @@ _text ENDS
 ;  * b   Number to subtract with in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_sub_4 PROC
         push	r12
         push	r13
@@ -55575,14 +55576,14 @@ sp_256_mont_sub_4 PROC
         pop	r12
         ret
 sp_256_mont_sub_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
 ;  *
 ;  * r  Result of division by 2.
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_div2_4 PROC
         push	r12
         push	r13
@@ -55614,7 +55615,7 @@ sp_256_mont_div2_4 PROC
         pop	r12
         ret
 sp_256_mont_div2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m).
 ;  *
 ;  * r   Result of subtration.
@@ -55622,7 +55623,7 @@ _text ENDS
 ;  * b   Number to double and subtract with in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_rsb_sub_dbl_4 PROC
         push	r12
         push	r13
@@ -55715,7 +55716,7 @@ sp_256_mont_rsb_sub_dbl_4 PROC
         pop	r12
         ret
 sp_256_mont_rsb_sub_dbl_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible point that could be being copied.
 ;  *
@@ -55723,7 +55724,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of point to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_get_point_33_4 PROC
         sub	rsp, 160
         movdqu	OWORD PTR [rsp], xmm6
@@ -55795,7 +55796,7 @@ L_256_get_point_33_4_start_1:
         add	rsp, 160
         ret
 sp_256_get_point_33_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible point that could be being copied.
 ;  *
@@ -55803,7 +55804,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of point to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_get_point_33_avx2_4 PROC
         sub	rsp, 64
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -55847,7 +55848,7 @@ L_256_get_point_33_avx2_4_start:
         add	rsp, 64
         ret
 sp_256_get_point_33_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFDEF HAVE_INTEL_AVX2
@@ -55860,7 +55861,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_mul_avx2_4 PROC
         push	rbp
         push	r12
@@ -56025,7 +56026,7 @@ sp_256_mont_mul_avx2_4 PROC
         pop	rbp
         ret
 sp_256_mont_mul_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
@@ -56035,7 +56036,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_sqr_avx2_4 PROC
         push	r12
         push	r13
@@ -56182,7 +56183,7 @@ sp_256_mont_sqr_avx2_4 PROC
         pop	r12
         ret
 sp_256_mont_sqr_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
@@ -56193,7 +56194,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_cond_sub_avx2_4 PROC
         push	r12
         push	r13
@@ -56230,7 +56231,7 @@ sp_256_cond_sub_avx2_4 PROC
         pop	r12
         ret
 sp_256_cond_sub_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -56239,7 +56240,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_reduce_order_avx2_4 PROC
         push	r12
         push	r13
@@ -56389,7 +56390,7 @@ sp_256_mont_reduce_order_avx2_4 PROC
         pop	r12
         ret
 sp_256_mont_reduce_order_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
@@ -56398,7 +56399,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_div2_avx2_4 PROC
         push	r12
         push	r13
@@ -56430,7 +56431,7 @@ sp_256_mont_div2_avx2_4 PROC
         pop	r12
         ret
 sp_256_mont_div2_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible entry that could be being copied.
@@ -56439,7 +56440,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_get_entry_64_4 PROC
         sub	rsp, 96
         movdqu	OWORD PTR [rsp], xmm6
@@ -56494,7 +56495,7 @@ L_256_get_entry_64_4_start_0:
         add	rsp, 96
         ret
 sp_256_get_entry_64_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible entry that could be being copied.
 ;  *
@@ -56502,7 +56503,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_get_entry_64_avx2_4 PROC
         sub	rsp, 32
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -56537,7 +56538,7 @@ L_256_get_entry_64_avx2_4_start:
         add	rsp, 32
         ret
 sp_256_get_entry_64_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
@@ -56547,7 +56548,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_get_entry_65_4 PROC
         sub	rsp, 96
         movdqu	OWORD PTR [rsp], xmm6
@@ -56602,7 +56603,7 @@ L_256_get_entry_65_4_start_0:
         add	rsp, 96
         ret
 sp_256_get_entry_65_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible entry that could be being copied.
 ;  *
@@ -56610,7 +56611,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_get_entry_65_avx2_4 PROC
         sub	rsp, 32
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -56645,14 +56646,14 @@ L_256_get_entry_65_avx2_4_start:
         add	rsp, 32
         ret
 sp_256_get_entry_65_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 ; /* Add 1 to a. (a = a + 1)
 ;  *
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_add_one_4 PROC
         add	QWORD PTR [rcx], 1
         adc	QWORD PTR [rcx+8], 0
@@ -56660,7 +56661,7 @@ sp_256_add_one_4 PROC
         adc	QWORD PTR [rcx+24], 0
         ret
 sp_256_add_one_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the bswap instruction.
 ;  *
@@ -56669,7 +56670,7 @@ _text ENDS
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_from_bin_bswap PROC
         push	r12
         push	r13
@@ -56747,7 +56748,7 @@ L_256_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_256_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -56757,7 +56758,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -56823,7 +56824,7 @@ L_256_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_256_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 32
@@ -56832,7 +56833,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_to_bin_bswap_4 PROC
         mov	rax, QWORD PTR [rcx+24]
         mov	r8, QWORD PTR [rcx+16]
@@ -56848,7 +56849,7 @@ sp_256_to_bin_bswap_4 PROC
         mov	QWORD PTR [rdx+24], r8
         ret
 sp_256_to_bin_bswap_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 32
@@ -56857,7 +56858,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_to_bin_movbe_4 PROC
         movbe	rax, QWORD PTR [rcx+24]
         movbe	r8, QWORD PTR [rcx+16]
@@ -56869,14 +56870,14 @@ sp_256_to_bin_movbe_4 PROC
         mov	QWORD PTR [rdx+24], r8
         ret
 sp_256_to_bin_movbe_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_sub_in_place_4 PROC
         mov	r8, QWORD PTR [rdx]
         mov	r9, QWORD PTR [rdx+8]
@@ -56889,14 +56890,14 @@ sp_256_sub_in_place_4 PROC
         sbb	rax, rax
         ret
 sp_256_sub_in_place_4 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mul_d_4 PROC
         push	r12
         mov	r9, rdx
@@ -56933,7 +56934,7 @@ sp_256_mul_d_4 PROC
         pop	r12
         ret
 sp_256_mul_d_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -56941,7 +56942,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mul_d_avx2_4 PROC
         push	r12
         push	r13
@@ -56975,7 +56976,7 @@ sp_256_mul_d_avx2_4 PROC
         pop	r12
         ret
 sp_256_mul_d_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -56985,7 +56986,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_256_word_asm_4 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -56993,7 +56994,7 @@ div_256_word_asm_4 PROC
         div	r8
         ret
 div_256_word_asm_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply two Montgomery form numbers mod the modulus (prime).
@@ -57003,7 +57004,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply in Montgomery form.
 ;  * b   Second number to multiply in Montgomery form.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_mul_order_avx2_4 PROC
         push	rbp
         push	r12
@@ -57214,7 +57215,7 @@ sp_256_mont_mul_order_avx2_4 PROC
         pop	rbp
         ret
 sp_256_mont_mul_order_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
@@ -57222,7 +57223,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * r   Result of squaring.
 ;  * a   Number to square in Montgomery form.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mont_sqr_order_avx2_4 PROC
         push	rbp
         push	r12
@@ -57417,7 +57418,7 @@ sp_256_mont_sqr_order_avx2_4 PROC
         pop	rbp
         ret
 sp_256_mont_sqr_order_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Non-constant time modular inversion.
 ;  *
@@ -57426,7 +57427,7 @@ ENDIF
 ;  * @param  [in]   m   Modulus.
 ;  * @return  MP_OKAY on success.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mod_inv_4 PROC
         push	r12
         push	r13
@@ -57638,47 +57639,64 @@ L_256_mod_inv_4_store_end:
         pop	r12
         ret
 sp_256_mod_inv_4 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_order DWORD 6497617,32001851,62711546,67108863,67043328,0,0,0,41070783,45522014,67108863,1023,4194303,0,0,0
+L_sp256_mod_inv_avx2_4_order DWORD \
+     00632551h,  01e84f3bh,  03bce6fah,  03ffffffh,
+     03ff0000h,  00000000h,  00000000h,  00000000h,
+     0272b0bfh,  02b69c5eh,  03ffffffh,  000003ffh,
+     003fffffh,  00000000h,  00000000h,  00000000h
 ptr_L_sp256_mod_inv_avx2_4_order QWORD L_sp256_mod_inv_avx2_4_order
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_one QWORD 1, 0,
-    0, 0
+L_sp256_mod_inv_avx2_4_one QWORD \
+     0000000000000001h,  0000000000000000h,
+     0000000000000000h,  0000000000000000h
 ptr_L_sp256_mod_inv_avx2_4_one QWORD L_sp256_mod_inv_avx2_4_one
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_all_one DWORD 1,1,1,1,1,1,1,1
+L_sp256_mod_inv_avx2_4_all_one DWORD \
+     00000001h,  00000001h,  00000001h,  00000001h,
+     00000001h,  00000001h,  00000001h,  00000001h
 ptr_L_sp256_mod_inv_avx2_4_all_one QWORD L_sp256_mod_inv_avx2_4_all_one
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_mask01111 DWORD 0,1,1,1,1,0,0,0
+L_sp256_mod_inv_avx2_4_mask01111 DWORD \
+     00000000h,  00000001h,  00000001h,  00000001h,
+     00000001h,  00000000h,  00000000h,  00000000h
 ptr_L_sp256_mod_inv_avx2_4_mask01111 QWORD L_sp256_mod_inv_avx2_4_mask01111
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_down_one_dword DWORD 1,2,3,4,5,6,7,7
+L_sp256_mod_inv_avx2_4_down_one_dword DWORD \
+     00000001h,  00000002h,  00000003h,  00000004h,
+     00000005h,  00000006h,  00000007h,  00000007h
 ptr_L_sp256_mod_inv_avx2_4_down_one_dword QWORD L_sp256_mod_inv_avx2_4_down_one_dword
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_neg DWORD 0,0,0,0,2147483648,0,0,0
+L_sp256_mod_inv_avx2_4_neg DWORD \
+     00000000h,  00000000h,  00000000h,  00000000h,
+     80000000h,  00000000h,  00000000h,  00000000h
 ptr_L_sp256_mod_inv_avx2_4_neg QWORD L_sp256_mod_inv_avx2_4_neg
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_up_one_dword DWORD 7,0,1,2,3,7,7,7
+L_sp256_mod_inv_avx2_4_up_one_dword DWORD \
+     00000007h,  00000000h,  00000001h,  00000002h,
+     00000003h,  00000007h,  00000007h,  00000007h
 ptr_L_sp256_mod_inv_avx2_4_up_one_dword QWORD L_sp256_mod_inv_avx2_4_up_one_dword
 _DATA ENDS
 _DATA SEGMENT
 ALIGN 16
-L_sp256_mod_inv_avx2_4_mask26 DWORD 67108863,67108863,67108863,67108863,67108863,0,0,0
+L_sp256_mod_inv_avx2_4_mask26 DWORD \
+     03ffffffh,  03ffffffh,  03ffffffh,  03ffffffh,
+     03ffffffh,  00000000h,  00000000h,  00000000h
 ptr_L_sp256_mod_inv_avx2_4_mask26 QWORD L_sp256_mod_inv_avx2_4_mask26
 _DATA ENDS
 ; /* Non-constant time modular inversion.
@@ -57688,7 +57706,7 @@ _DATA ENDS
 ;  * @param  [in]   m   Modulus.
 ;  * @return  MP_OKAY on success.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_256_mod_inv_avx2_4 PROC
         push	r12
         push	r13
@@ -57820,8 +57838,8 @@ L_256_mod_inv_avx2_4_usubv_sub_shr1:
         vpextrd	r11d, xmm1, 1
         vpextrd	r13d, xmm1, 2
         vpextrd	r15d, xmm1, 3
-        vextracti128 	xmm0, ymm0, 1
-        vextracti128 	xmm1, ymm1, 1
+        vextracti128	xmm0, ymm0, 1
+        vextracti128	xmm1, ymm1, 1
         vpextrd	edi, xmm0, 0
         vpextrd	esi, xmm1, 0
         jmp	L_256_mod_inv_avx2_4_store_done
@@ -57873,8 +57891,8 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpextrd	r11d, xmm3, 1
         vpextrd	r13d, xmm3, 2
         vpextrd	r15d, xmm3, 3
-        vextracti128 	xmm2, ymm2, 1
-        vextracti128 	xmm3, ymm3, 1
+        vextracti128	xmm2, ymm2, 1
+        vextracti128	xmm3, ymm3, 1
         vpextrd	edi, xmm2, 0
         vpextrd	esi, xmm3, 0
 L_256_mod_inv_avx2_4_store_done:
@@ -57934,7 +57952,7 @@ L_256_mod_inv_avx2_4_store_done:
         adc	r14, r15
         movsxd	rdi, edi
         adc	rdi, rsi
-        jge	L_256_mod_inv_avx2_4_3_no_add_order
+        jge	L_256_mod_inv_avx2_4_no_add_order
         mov	r9, 2756213597218129
         mov	r11, 3054930678533947
         mov	r13, 4503599622973178
@@ -57962,7 +57980,7 @@ L_256_mod_inv_avx2_4_store_done:
         and	r14, rdx
         sar	r15, 52
         add	rdi, r15
-L_256_mod_inv_avx2_4_3_no_add_order:
+L_256_mod_inv_avx2_4_no_add_order:
         mov	r9, r10
         mov	r11, r12
         mov	r13, r14
@@ -58000,7 +58018,7 @@ L_256_mod_inv_avx2_4_3_no_add_order:
         pop	r12
         ret
 sp_256_mod_inv_avx2_4 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFDEF WOLFSSL_SP_384
@@ -58010,7 +58028,7 @@ IFDEF WOLFSSL_SP_384
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mul_6 PROC
         push	r12
         mov	r9, rdx
@@ -58266,7 +58284,7 @@ sp_384_mul_6 PROC
         pop	r12
         ret
 sp_384_mul_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -58274,7 +58292,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply.
 ;  * b   Second number to multiply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mul_avx2_6 PROC
         push	r12
         push	r13
@@ -58482,14 +58500,14 @@ sp_384_mul_avx2_6 PROC
         pop	r12
         ret
 sp_384_mul_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_sqr_6 PROC
         push	r12
         push	r13
@@ -58701,14 +58719,14 @@ sp_384_sqr_6 PROC
         pop	r12
         ret
 sp_384_sqr_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r   Result of squaring.
 ;  * a   Number to square in Montgomery form.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_sqr_avx2_6 PROC
         push	r12
         push	r13
@@ -58858,7 +58876,7 @@ sp_384_sqr_avx2_6 PROC
         pop	r12
         ret
 sp_384_sqr_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Add b to a into r. (r = a + b)
 ;  *
@@ -58866,7 +58884,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_add_6 PROC
         push	r12
         push	r13
@@ -58896,14 +58914,14 @@ sp_384_add_6 PROC
         pop	r12
         ret
 sp_384_add_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into r. (r = a - b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_sub_6 PROC
         push	r12
         push	r13
@@ -58933,7 +58951,7 @@ sp_384_sub_6 PROC
         pop	r12
         ret
 sp_384_sub_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally copy a into r using the mask m.
 ;  * m is -1 to copy and 0 when not.
 ;  *
@@ -58941,7 +58959,7 @@ _text ENDS
 ;  * a  A single precision number to copy.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_cond_copy_6 PROC
         push	r12
         push	r13
@@ -58973,7 +58991,7 @@ sp_384_cond_copy_6 PROC
         pop	r12
         ret
 sp_384_cond_copy_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -58982,7 +59000,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_cond_sub_6 PROC
         sub	rsp, 48
         mov	r10, QWORD PTR [r8]
@@ -59031,14 +59049,14 @@ sp_384_cond_sub_6 PROC
         add	rsp, 48
         ret
 sp_384_cond_sub_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 384 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_reduce_6 PROC
         push	r12
         push	r13
@@ -59203,14 +59221,14 @@ sp_384_mont_reduce_6 PROC
         pop	r12
         ret
 sp_384_mont_reduce_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 384 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_reduce_order_6 PROC
         push	r12
         push	r13
@@ -59310,7 +59328,7 @@ ENDIF
         pop	r12
         ret
 sp_384_mont_reduce_order_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Compare a with b in constant time.
 ;  *
 ;  * a  A single precision integer.
@@ -59318,7 +59336,7 @@ _text ENDS
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_cmp_6 PROC
         push	r12
         xor	r9, r9
@@ -59377,7 +59395,7 @@ sp_384_cmp_6 PROC
         pop	r12
         ret
 sp_384_cmp_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add two Montgomery form numbers (r = a + b % m).
 ;  *
 ;  * r   Result of addition.
@@ -59385,7 +59403,7 @@ _text ENDS
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_add_6 PROC
         push	r12
         push	r13
@@ -59439,14 +59457,14 @@ sp_384_mont_add_6 PROC
         pop	r12
         ret
 sp_384_mont_add_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Double a Montgomery form number (r = a + a % m).
 ;  *
 ;  * r   Result of doubling.
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_dbl_6 PROC
         push	r12
         push	r13
@@ -59501,14 +59519,14 @@ sp_384_mont_dbl_6 PROC
         pop	r12
         ret
 sp_384_mont_dbl_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Double a Montgomery form number (r = a + a % m).
 ;  *
 ;  * r   Result of doubling.
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_tpl_6 PROC
         push	r12
         push	r13
@@ -59591,7 +59609,7 @@ sp_384_mont_tpl_6 PROC
         pop	r12
         ret
 sp_384_mont_tpl_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Subtract two Montgomery form numbers (r = a - b % m).
 ;  *
 ;  * r   Result of subtration.
@@ -59599,7 +59617,7 @@ _text ENDS
 ;  * b   Number to subtract with in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_sub_6 PROC
         push	r12
         push	r13
@@ -59653,14 +59671,14 @@ sp_384_mont_sub_6 PROC
         pop	r12
         ret
 sp_384_mont_sub_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
 ;  *
 ;  * r  Result of division by 2.
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_div2_6 PROC
         push	r12
         push	r13
@@ -59723,7 +59741,7 @@ sp_384_mont_div2_6 PROC
         pop	r12
         ret
 sp_384_mont_div2_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible point that could be being copied.
 ;  *
@@ -59731,7 +59749,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of point to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_get_point_33_6 PROC
         sub	rsp, 160
         movdqu	OWORD PTR [rsp], xmm6
@@ -59834,7 +59852,7 @@ L_384_get_point_33_6_start_2:
         add	rsp, 160
         ret
 sp_384_get_point_33_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible point that could be being copied.
 ;  *
@@ -59842,7 +59860,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of point to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_get_point_33_avx2_6 PROC
         sub	rsp, 160
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -59913,7 +59931,7 @@ L_384_get_point_33_avx2_6_start:
         add	rsp, 160
         ret
 sp_384_get_point_33_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFDEF HAVE_INTEL_AVX2
@@ -59923,7 +59941,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_reduce_order_avx2_6 PROC
         push	r12
         push	r13
@@ -60237,7 +60255,7 @@ L_mont_loop_order_avx2_6:
         pop	r12
         ret
 sp_384_mont_reduce_order_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
@@ -60248,7 +60266,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_cond_sub_avx2_6 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -60285,7 +60303,7 @@ sp_384_cond_sub_avx2_6 PROC
         pop	r12
         ret
 sp_384_cond_sub_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
@@ -60294,7 +60312,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mont_div2_avx2_6 PROC
         push	r12
         push	r13
@@ -60356,7 +60374,7 @@ sp_384_mont_div2_avx2_6 PROC
         pop	r12
         ret
 sp_384_mont_div2_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible entry that could be being copied.
@@ -60365,7 +60383,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_get_entry_64_6 PROC
         sub	rsp, 160
         movdqu	OWORD PTR [rsp], xmm6
@@ -60438,7 +60456,7 @@ L_384_get_entry_64_6_start_0:
         add	rsp, 160
         ret
 sp_384_get_entry_64_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible entry that could be being copied.
 ;  *
@@ -60446,7 +60464,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_get_entry_64_avx2_6 PROC
         sub	rsp, 96
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -60499,7 +60517,7 @@ L_384_get_entry_64_avx2_6_start:
         add	rsp, 96
         ret
 sp_384_get_entry_64_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
@@ -60509,7 +60527,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_get_entry_65_6 PROC
         sub	rsp, 160
         movdqu	OWORD PTR [rsp], xmm6
@@ -60582,7 +60600,7 @@ L_384_get_entry_65_6_start_0:
         add	rsp, 160
         ret
 sp_384_get_entry_65_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible entry that could be being copied.
 ;  *
@@ -60590,7 +60608,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_get_entry_65_avx2_6 PROC
         sub	rsp, 96
         vmovdqu	OWORD PTR [rsp], xmm6
@@ -60643,14 +60661,14 @@ L_384_get_entry_65_avx2_6_start:
         add	rsp, 96
         ret
 sp_384_get_entry_65_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 ; /* Add 1 to a. (a = a + 1)
 ;  *
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_add_one_6 PROC
         add	QWORD PTR [rcx], 1
         adc	QWORD PTR [rcx+8], 0
@@ -60660,7 +60678,7 @@ sp_384_add_one_6 PROC
         adc	QWORD PTR [rcx+40], 0
         ret
 sp_384_add_one_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the bswap instruction.
 ;  *
@@ -60669,7 +60687,7 @@ _text ENDS
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_from_bin_bswap PROC
         push	r12
         push	r13
@@ -60747,7 +60765,7 @@ L_384_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_384_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -60757,7 +60775,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -60823,7 +60841,7 @@ L_384_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_384_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 48
@@ -60832,7 +60850,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_to_bin_bswap_6 PROC
         mov	rax, QWORD PTR [rcx+40]
         mov	r8, QWORD PTR [rcx+32]
@@ -60854,7 +60872,7 @@ sp_384_to_bin_bswap_6 PROC
         mov	QWORD PTR [rdx+40], r8
         ret
 sp_384_to_bin_bswap_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 48
@@ -60863,7 +60881,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_to_bin_movbe_6 PROC
         movbe	rax, QWORD PTR [rcx+40]
         movbe	r8, QWORD PTR [rcx+32]
@@ -60879,14 +60897,14 @@ sp_384_to_bin_movbe_6 PROC
         mov	QWORD PTR [rdx+40], r8
         ret
 sp_384_to_bin_movbe_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_sub_in_place_6 PROC
         push	r12
         push	r13
@@ -60907,14 +60925,14 @@ sp_384_sub_in_place_6 PROC
         pop	r12
         ret
 sp_384_sub_in_place_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mul_d_6 PROC
         push	r12
         mov	r9, rdx
@@ -60967,7 +60985,7 @@ sp_384_mul_d_6 PROC
         pop	r12
         ret
 sp_384_mul_d_6 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -60975,7 +60993,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_mul_d_avx2_6 PROC
         push	r12
         push	r13
@@ -61021,7 +61039,7 @@ sp_384_mul_d_avx2_6 PROC
         pop	r12
         ret
 sp_384_mul_d_avx2_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -61031,7 +61049,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_384_word_asm_6 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -61039,14 +61057,14 @@ div_384_word_asm_6 PROC
         div	r8
         ret
 div_384_word_asm_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Shift number right by 1 bit. (r = a >> 1)
 ;  *
 ;  * r  Result of right shift by 1.
 ;  * a  Number to shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_rshift1_6 PROC
         push	r12
         mov	rax, QWORD PTR [rdx]
@@ -61070,14 +61088,14 @@ sp_384_rshift1_6 PROC
         pop	r12
         ret
 sp_384_rshift1_6 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Divide the number by 2 mod the prime. (r = a / 2 % m)
 ;  *
 ;  * r  Result of division by 2.
 ;  * a  Number to divide.
 ;  * m  Modulus
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_384_div2_mod_6 PROC
         push	r12
         push	r13
@@ -61133,8 +61151,8 @@ L_384_mod_inv_6_div2_mod_no_add:
         pop	r12
         ret
 sp_384_div2_mod_6 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 sp_384_num_bits_6 PROC
         xor	rax, rax
         mov	rdx, QWORD PTR [rcx+40]
@@ -61188,7 +61206,7 @@ L_384_num_bits_6_end_0:
 L_384_num_bits_6_done:
         ret
 sp_384_num_bits_6 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF WOLFSSL_SP_521
 ; /* Multiply a and b into r. (r = a * b)
@@ -61197,7 +61215,7 @@ IFDEF WOLFSSL_SP_521
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mul_9 PROC
         push	r12
         mov	r9, rdx
@@ -61741,7 +61759,7 @@ sp_521_mul_9 PROC
         pop	r12
         ret
 sp_521_mul_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -61749,7 +61767,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply.
 ;  * b   Second number to multiply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mul_avx2_9 PROC
         push	rbx
         push	rbp
@@ -62319,14 +62337,14 @@ L_end_521_mul_avx2_9:
         pop	rbx
         ret
 sp_521_mul_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_sqr_9 PROC
         push	r12
         push	r13
@@ -62736,14 +62754,14 @@ sp_521_sqr_9 PROC
         pop	r12
         ret
 sp_521_sqr_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_sqr_avx2_9 PROC
         push	rbp
         push	r12
@@ -63134,7 +63152,7 @@ L_end_521_sqr_avx2_9:
         pop	rbp
         ret
 sp_521_sqr_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Add b to a into r. (r = a + b)
 ;  *
@@ -63142,7 +63160,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_add_9 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -63176,14 +63194,14 @@ sp_521_add_9 PROC
         adc	rax, 0
         ret
 sp_521_add_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into r. (r = a - b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_sub_9 PROC
         mov	r9, QWORD PTR [rdx]
         sub	r9, QWORD PTR [r8]
@@ -63215,7 +63233,7 @@ sp_521_sub_9 PROC
         sbb	rax, rax
         ret
 sp_521_sub_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally copy a into r using the mask m.
 ;  * m is -1 to copy and 0 when not.
 ;  *
@@ -63223,7 +63241,7 @@ _text ENDS
 ;  * a  A single precision number to copy.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_cond_copy_9 PROC
         push	r12
         mov	rax, QWORD PTR [rcx]
@@ -63265,7 +63283,7 @@ sp_521_cond_copy_9 PROC
         pop	r12
         ret
 sp_521_cond_copy_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Multiply two Montgomery form numbers mod the modulus (prime).
 ;  * (r = a * b mod m)
 ;  *
@@ -63275,7 +63293,7 @@ _text ENDS
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_mul_9 PROC
         push	r12
         push	r13
@@ -63857,7 +63875,7 @@ sp_521_mont_mul_9 PROC
         pop	r12
         ret
 sp_521_mont_mul_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
 ;  *
 ;  * r   Result of squaring.
@@ -63865,7 +63883,7 @@ _text ENDS
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_sqr_9 PROC
         push	r12
         push	r13
@@ -64309,7 +64327,7 @@ sp_521_mont_sqr_9 PROC
         pop	r12
         ret
 sp_521_mont_sqr_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Compare a with b in constant time.
 ;  *
 ;  * a  A single precision integer.
@@ -64317,7 +64335,7 @@ _text ENDS
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_cmp_9 PROC
         push	r12
         xor	r9, r9
@@ -64400,7 +64418,7 @@ sp_521_cmp_9 PROC
         pop	r12
         ret
 sp_521_cmp_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -64409,7 +64427,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_cond_sub_9 PROC
         sub	rsp, 72
         mov	r10, QWORD PTR [r8]
@@ -64479,14 +64497,14 @@ sp_521_cond_sub_9 PROC
         add	rsp, 72
         ret
 sp_521_cond_sub_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 521 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_reduce_9 PROC
         push	r12
         push	r13
@@ -64548,14 +64566,14 @@ sp_521_mont_reduce_9 PROC
         pop	r12
         ret
 sp_521_mont_reduce_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 521 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_reduce_order_9 PROC
         push	r12
         push	r13
@@ -64723,7 +64741,7 @@ ENDIF
         pop	r12
         ret
 sp_521_mont_reduce_order_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add two Montgomery form numbers (r = a + b % m).
 ;  *
 ;  * r   Result of addition.
@@ -64731,7 +64749,7 @@ _text ENDS
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_add_9 PROC
         push	r12
         push	r13
@@ -64786,14 +64804,14 @@ sp_521_mont_add_9 PROC
         pop	r12
         ret
 sp_521_mont_add_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Double a Montgomery form number (r = a + a % m).
 ;  *
 ;  * r   Result of addition.
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_dbl_9 PROC
         push	r12
         push	r13
@@ -64846,14 +64864,14 @@ sp_521_mont_dbl_9 PROC
         pop	r12
         ret
 sp_521_mont_dbl_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Triple a Montgomery form number (r = a + a + a % m).
 ;  *
 ;  * r   Result of Tripling.
 ;  * a   Number to triple in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_tpl_9 PROC
         push	r12
         push	r13
@@ -64915,7 +64933,7 @@ sp_521_mont_tpl_9 PROC
         pop	r12
         ret
 sp_521_mont_tpl_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Subtract two Montgomery form numbers (r = a - b % m).
 ;  *
 ;  * r   Result of addition.
@@ -64923,7 +64941,7 @@ _text ENDS
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_sub_9 PROC
         push	r12
         push	r13
@@ -64979,14 +64997,14 @@ sp_521_mont_sub_9 PROC
         pop	r12
         ret
 sp_521_mont_sub_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
 ;  *
 ;  * r  Result of division by 2.
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_div2_9 PROC
         push	r12
         push	r13
@@ -65040,7 +65058,7 @@ sp_521_mont_div2_9 PROC
         pop	r12
         ret
 sp_521_mont_div2_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible point that could be being copied.
 ;  *
@@ -65048,7 +65066,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of point to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_get_point_33_9 PROC
         push	r12
         push	r13
@@ -65200,7 +65218,7 @@ L_521_get_point_33_9_start_2:
         pop	r12
         ret
 sp_521_get_point_33_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible point that could be being copied.
 ;  *
@@ -65208,7 +65226,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of point to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_get_point_33_avx2_9 PROC
         push	r12
         push	r13
@@ -65310,7 +65328,7 @@ L_521_get_point_33_avx2_9_start:
         pop	r12
         ret
 sp_521_get_point_33_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFDEF HAVE_INTEL_AVX2
@@ -65323,7 +65341,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_mul_avx2_9 PROC
         push	rbx
         push	rbp
@@ -65923,7 +65941,7 @@ sp_521_mont_mul_avx2_9 PROC
         pop	rbx
         ret
 sp_521_mont_mul_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
@@ -65933,7 +65951,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   Modulus (prime).
 ;  * mp  Montgomery multiplier.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_sqr_avx2_9 PROC
         push	rbp
         push	r12
@@ -66365,7 +66383,7 @@ sp_521_mont_sqr_avx2_9 PROC
         pop	rbp
         ret
 sp_521_mont_sqr_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
@@ -66376,7 +66394,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_cond_sub_avx2_9 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -66428,7 +66446,7 @@ sp_521_cond_sub_avx2_9 PROC
         pop	r12
         ret
 sp_521_cond_sub_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 521 bits using Montgomery reduction.
@@ -66437,7 +66455,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_reduce_order_avx2_9 PROC
         push	r12
         push	r13
@@ -66741,7 +66759,7 @@ L_521_mont_reduce_order_avx2_9_loop:
         pop	r12
         ret
 sp_521_mont_reduce_order_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
@@ -66750,7 +66768,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mont_div2_avx2_9 PROC
         push	r12
         push	r13
@@ -66804,7 +66822,7 @@ sp_521_mont_div2_avx2_9 PROC
         pop	r12
         ret
 sp_521_mont_div2_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible entry that could be being copied.
@@ -66813,7 +66831,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_get_entry_64_9 PROC
         push	r12
         sub	rsp, 160
@@ -66937,7 +66955,7 @@ L_521_get_entry_64_9_start_1:
         pop	r12
         ret
 sp_521_get_entry_64_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible entry that could be being copied.
 ;  *
@@ -66945,7 +66963,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_get_entry_64_avx2_9 PROC
         push	r12
         push	r13
@@ -67020,7 +67038,7 @@ L_521_get_entry_64_avx2_9_start:
         pop	r12
         ret
 sp_521_get_entry_64_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
@@ -67030,7 +67048,7 @@ IFNDEF WC_NO_CACHE_RESISTANT
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_get_entry_65_9 PROC
         push	r12
         sub	rsp, 160
@@ -67154,7 +67172,7 @@ L_521_get_entry_65_9_start_1:
         pop	r12
         ret
 sp_521_get_entry_65_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Touch each possible entry that could be being copied.
 ;  *
@@ -67162,7 +67180,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * table  Table - start of the entries to access
 ;  * idx    Index of entry to retrieve.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_get_entry_65_avx2_9 PROC
         push	r12
         push	r13
@@ -67237,14 +67255,14 @@ L_521_get_entry_65_avx2_9_start:
         pop	r12
         ret
 sp_521_get_entry_65_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 ; /* Add 1 to a. (a = a + 1)
 ;  *
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_add_one_9 PROC
         add	QWORD PTR [rcx], 1
         adc	QWORD PTR [rcx+8], 0
@@ -67257,7 +67275,7 @@ sp_521_add_one_9 PROC
         adc	QWORD PTR [rcx+64], 0
         ret
 sp_521_add_one_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the bswap instruction.
 ;  *
@@ -67266,7 +67284,7 @@ _text ENDS
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_from_bin_bswap PROC
         push	r12
         push	r13
@@ -67344,7 +67362,7 @@ L_521_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_521_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -67354,7 +67372,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -67420,7 +67438,7 @@ L_521_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_521_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 65
@@ -67429,7 +67447,7 @@ ENDIF
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_to_bin_bswap_9 PROC
         mov	r8b, BYTE PTR [rcx+64]
         mov	al, BYTE PTR [rcx+65]
@@ -67461,7 +67479,7 @@ sp_521_to_bin_bswap_9 PROC
         mov	QWORD PTR [rdx+58], r8
         ret
 sp_521_to_bin_bswap_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Write r as big endian to byte array.
 ;  * Fixed length number of bytes written: 65
@@ -67470,7 +67488,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * r  A single precision integer.
 ;  * a  Byte array.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_to_bin_movbe_9 PROC
         mov	r8b, BYTE PTR [rcx+64]
         mov	al, BYTE PTR [rcx+65]
@@ -67494,14 +67512,14 @@ sp_521_to_bin_movbe_9 PROC
         mov	QWORD PTR [rdx+58], r8
         ret
 sp_521_to_bin_movbe_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Shift number right by 1 bit. (r = a >> 1)
 ;  *
 ;  * r  Result of right shift by 1.
 ;  * a  Number to shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_rshift_9 PROC
         push	r12
         mov	rax, rcx
@@ -67536,14 +67554,14 @@ sp_521_rshift_9 PROC
         pop	r12
         ret
 sp_521_rshift_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Shift number left by n bit. (r = a << n)
 ;  *
 ;  * r  Result of left shift by n.
 ;  * a  Number to shift.
 ;  * n  Amoutnt o shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_lshift_9 PROC
         push	r12
         push	r13
@@ -67583,14 +67601,14 @@ sp_521_lshift_9 PROC
         pop	r12
         ret
 sp_521_lshift_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Shift number left by n bit. (r = a << n)
 ;  *
 ;  * r  Result of left shift by n.
 ;  * a  Number to shift.
 ;  * n  Amoutnt o shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_lshift_18 PROC
         push	r12
         push	r13
@@ -67657,13 +67675,13 @@ sp_521_lshift_18 PROC
         pop	r12
         ret
 sp_521_lshift_18 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_sub_in_place_9 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -67695,14 +67713,14 @@ sp_521_sub_in_place_9 PROC
         sbb	rax, rax
         ret
 sp_521_sub_in_place_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mul_d_9 PROC
         push	r12
         mov	r9, rdx
@@ -67779,7 +67797,7 @@ sp_521_mul_d_9 PROC
         pop	r12
         ret
 sp_521_mul_d_9 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -67787,7 +67805,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_mul_d_avx2_9 PROC
         push	r12
         push	r13
@@ -67851,7 +67869,7 @@ sp_521_mul_d_avx2_9 PROC
         pop	r12
         ret
 sp_521_mul_d_avx2_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -67861,7 +67879,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_521_word_asm_9 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -67869,14 +67887,14 @@ div_521_word_asm_9 PROC
         div	r8
         ret
 div_521_word_asm_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Shift number right by 1 bit. (r = a >> 1)
 ;  *
 ;  * r  Result of right shift by 1.
 ;  * a  Number to shift.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_rshift1_9 PROC
         push	r12
         mov	rax, QWORD PTR [rdx]
@@ -67909,14 +67927,14 @@ sp_521_rshift1_9 PROC
         pop	r12
         ret
 sp_521_rshift1_9 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Divide the number by 2 mod the prime. (r = a / 2 % m)
 ;  *
 ;  * r  Result of division by 2.
 ;  * a  Number to divide.
 ;  * m  Modulus
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_521_div2_mod_9 PROC
         push	r12
         mov	rax, QWORD PTR [rdx]
@@ -67989,8 +68007,8 @@ L_521_mod_inv_9_div2_mod_no_add:
         pop	r12
         ret
 sp_521_div2_mod_9 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
 sp_521_num_bits_9 PROC
         xor	rax, rax
         mov	rdx, QWORD PTR [rcx+64]
@@ -68068,7 +68086,7 @@ L_521_num_bits_9_end_0:
 L_521_num_bits_9_done:
         ret
 sp_521_num_bits_9 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF WOLFSSL_SP_1024
 ; /* Multiply a and b into r. (r = a * b)
@@ -68077,7 +68095,7 @@ IFDEF WOLFSSL_SP_1024
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mul_16 PROC
         push	r12
         mov	r9, rdx
@@ -69713,13 +69731,13 @@ sp_1024_mul_16 PROC
         pop	r12
         ret
 sp_1024_mul_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Square a and put result in r. (r = a * a)
 ;  *
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_sqr_16 PROC
         push	r12
         push	r13
@@ -70801,7 +70819,7 @@ sp_1024_sqr_16 PROC
         pop	r12
         ret
 sp_1024_sqr_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Multiply a and b into r. (r = a * b)
 ;  *
@@ -70809,7 +70827,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   First number to multiply.
 ;  * b   Second number to multiply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mul_avx2_16 PROC
         push	rbx
         push	rbp
@@ -72472,7 +72490,7 @@ L_end_1024_mul_avx2_16:
         pop	rbx
         ret
 sp_1024_mul_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Square a and put result in r. (r = a * a)
@@ -72480,7 +72498,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * r  A single precision integer.
 ;  * a  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_sqr_avx2_16 PROC
         push	rbp
         push	r12
@@ -73526,7 +73544,7 @@ L_end_1024_sqr_avx2_16:
         pop	rbp
         ret
 sp_1024_sqr_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Add b to a into r. (r = a + b)
 ;  *
@@ -73534,7 +73552,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_add_16 PROC
         ; Add
         mov	r9, QWORD PTR [rdx]
@@ -73589,13 +73607,13 @@ sp_1024_add_16 PROC
         adc	rax, 0
         ret
 sp_1024_add_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Sub b from a into a. (a -= b)
 ;  *
 ;  * a  A single precision integer and result.
 ;  * b  A single precision integer.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_sub_in_place_16 PROC
         mov	r8, QWORD PTR [rcx]
         sub	r8, QWORD PTR [rdx]
@@ -73648,7 +73666,7 @@ sp_1024_sub_in_place_16 PROC
         sbb	rax, rax
         ret
 sp_1024_sub_in_place_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
 ;  *
@@ -73657,7 +73675,7 @@ _text ENDS
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_cond_sub_16 PROC
         sub	rsp, 128
         mov	r10, QWORD PTR [r8]
@@ -73776,7 +73794,7 @@ sp_1024_cond_sub_16 PROC
         add	rsp, 128
         ret
 sp_1024_cond_sub_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Conditionally subtract b from a using the mask m.
 ;  * m is -1 to subtract and 0 when not copying.
@@ -73786,7 +73804,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b  A single precision number to subtract.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_cond_sub_avx2_16 PROC
         push	r12
         mov	r12, QWORD PTR [r8]
@@ -73873,7 +73891,7 @@ sp_1024_cond_sub_avx2_16 PROC
         pop	r12
         ret
 sp_1024_cond_sub_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -73881,7 +73899,7 @@ ENDIF
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mul_d_16 PROC
         push	r12
         mov	r9, rdx
@@ -74014,7 +74032,7 @@ sp_1024_mul_d_16 PROC
         pop	r12
         ret
 sp_1024_mul_d_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Mul a by digit b into r. (r = a * b)
 ;  *
@@ -74022,7 +74040,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  A single precision integer.
 ;  * b  A single precision digit.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mul_d_avx2_16 PROC
         push	r12
         push	r13
@@ -74128,7 +74146,7 @@ sp_1024_mul_d_avx2_16 PROC
         pop	r12
         ret
 sp_1024_mul_d_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF _WIN64
 ; /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div)
@@ -74138,7 +74156,7 @@ IFDEF _WIN64
 ;  * div  The dividend.
 ;  * returns the result of the division.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 div_1024_word_asm_16 PROC
         mov	r9, rdx
         mov	rax, r9
@@ -74146,7 +74164,7 @@ div_1024_word_asm_16 PROC
         div	r8
         ret
 div_1024_word_asm_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Compare a with b in constant time.
 ;  *
@@ -74155,7 +74173,7 @@ ENDIF
 ;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
 ;  * respectively.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_cmp_16 PROC
         push	r12
         xor	r9, r9
@@ -74294,7 +74312,7 @@ sp_1024_cmp_16 PROC
         pop	r12
         ret
 sp_1024_cmp_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Conditionally copy a into r using the mask m.
 ;  * m is -1 to copy and 0 when not.
 ;  *
@@ -74302,7 +74320,7 @@ _text ENDS
 ;  * a  A single precision number to copy.
 ;  * m  Mask value to apply.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_cond_copy_16 PROC
         mov	rax, QWORD PTR [rcx]
         mov	r9, QWORD PTR [rcx+8]
@@ -74370,14 +74388,14 @@ sp_1024_cond_copy_16 PROC
         xor	QWORD PTR [rcx+120], r11
         ret
 sp_1024_cond_copy_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Reduce the number back to 1024 bits using Montgomery reduction.
 ;  *
 ;  * a   A single precision number to reduce in place.
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_reduce_16 PROC
         push	r12
         push	r13
@@ -74582,7 +74600,7 @@ ENDIF
         pop	r12
         ret
 sp_1024_mont_reduce_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Add two Montgomery form numbers (r = a + b % m).
 ;  *
 ;  * r   Result of addition.
@@ -74590,7 +74608,7 @@ _text ENDS
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_add_16 PROC
         push	r12
         push	r13
@@ -74750,14 +74768,14 @@ sp_1024_mont_add_16 PROC
         pop	r12
         ret
 sp_1024_mont_add_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Double a Montgomery form number (r = a + a % m).
 ;  *
 ;  * r   Result of addition.
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_dbl_16 PROC
         push	r12
         sub	rsp, 128
@@ -74915,14 +74933,14 @@ sp_1024_mont_dbl_16 PROC
         pop	r12
         ret
 sp_1024_mont_dbl_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Triple a Montgomery form number (r = a + a + a % m).
 ;  *
 ;  * r   Result of addition.
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_tpl_16 PROC
         push	r12
         sub	rsp, 128
@@ -75230,7 +75248,7 @@ sp_1024_mont_tpl_16 PROC
         pop	r12
         ret
 sp_1024_mont_tpl_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Subtract two Montgomery form numbers (r = a - b % m).
 ;  *
 ;  * r   Result of addition.
@@ -75238,7 +75256,7 @@ _text ENDS
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_sub_16 PROC
         push	r12
         push	r13
@@ -75394,14 +75412,14 @@ sp_1024_mont_sub_16 PROC
         pop	r12
         ret
 sp_1024_mont_sub_16 ENDP
-_text ENDS
+_TEXT ENDS
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
 ;  *
 ;  * r  Result of division by 2.
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_div2_16 PROC
         push	r12
         push	r13
@@ -75544,7 +75562,7 @@ sp_1024_mont_div2_16 PROC
         pop	r12
         ret
 sp_1024_mont_div2_16 ENDP
-_text ENDS
+_TEXT ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 1024 bits using Montgomery reduction.
 ;  *
@@ -75552,7 +75570,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m   The single precision number representing the modulus.
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_reduce_avx2_16 PROC
         push	r12
         push	r13
@@ -75879,7 +75897,7 @@ L_1024_mont_reduce_avx2_16_loop:
         pop	r12
         ret
 sp_1024_mont_reduce_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Add two Montgomery form numbers (r = a + b % m).
@@ -75889,7 +75907,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_add_avx2_16 PROC
         push	r12
         push	r13
@@ -76031,7 +76049,7 @@ sp_1024_mont_add_avx2_16 PROC
         pop	r12
         ret
 sp_1024_mont_add_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Double a Montgomery form number (r = a + a % m).
@@ -76040,7 +76058,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_dbl_avx2_16 PROC
         push	r12
         mov	rax, QWORD PTR [rdx]
@@ -76180,7 +76198,7 @@ sp_1024_mont_dbl_avx2_16 PROC
         pop	r12
         ret
 sp_1024_mont_dbl_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Triple a Montgomery form number (r = a + a + a % m).
@@ -76189,7 +76207,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a   Number to double in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_tpl_avx2_16 PROC
         push	r12
         mov	rax, QWORD PTR [rdx]
@@ -76463,7 +76481,7 @@ sp_1024_mont_tpl_avx2_16 PROC
         pop	r12
         ret
 sp_1024_mont_tpl_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Subtract two Montgomery form numbers (r = a - b % m).
@@ -76473,7 +76491,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * b   Second number to add in Montgomery form.
 ;  * m   Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_sub_avx2_16 PROC
         push	r12
         push	r13
@@ -76611,7 +76629,7 @@ sp_1024_mont_sub_avx2_16 PROC
         pop	r12
         ret
 sp_1024_mont_sub_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
 ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
@@ -76620,7 +76638,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * a  Number to divide.
 ;  * m  Modulus (prime).
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_mont_div2_avx2_16 PROC
         push	r12
         push	r13
@@ -76762,7 +76780,7 @@ sp_1024_mont_div2_avx2_16 PROC
         pop	r12
         ret
 sp_1024_mont_div2_avx2_16 ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the bswap instruction.
@@ -76772,7 +76790,7 @@ ENDIF
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_from_bin_bswap PROC
         push	r12
         push	r13
@@ -76850,7 +76868,7 @@ L_1024_from_bin_bswap_zero_end:
         pop	r12
         ret
 sp_1024_from_bin_bswap ENDP
-_text ENDS
+_TEXT ENDS
 IFNDEF NO_MOVBE_SUPPORT
 ; /* Read big endian unsigned byte array into r.
 ;  * Uses the movbe instruction which is an optional instruction.
@@ -76860,7 +76878,7 @@ IFNDEF NO_MOVBE_SUPPORT
 ;  * a  Byte array.
 ;  * n  Number of bytes in array to read.
 ;  */
-_text SEGMENT READONLY PARA
+_TEXT SEGMENT READONLY PARA
 sp_1024_from_bin_movbe PROC
         push	r12
         mov	r11, r8
@@ -76926,7 +76944,7 @@ L_1024_from_bin_movbe_zero_end:
         pop	r12
         ret
 sp_1024_from_bin_movbe ENDP
-_text ENDS
+_TEXT ENDS
 ENDIF
 ENDIF
 END
diff --git a/wolfcrypt/src/wc_mldsa_asm.S b/wolfcrypt/src/wc_mldsa_asm.S
index f21bd8ff7fa..52acb659dcd 100644
--- a/wolfcrypt/src/wc_mldsa_asm.S
+++ b/wolfcrypt/src/wc_mldsa_asm.S
@@ -54,6 +54,11 @@
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 mldsa_q:
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
@@ -62,6 +67,11 @@ mldsa_q:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 mldsa_qinv:
 .long	0x03802001,0x03802001,0x03802001,0x03802001
 .long	0x03802001,0x03802001,0x03802001,0x03802001
@@ -70,6 +80,11 @@ mldsa_qinv:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 mldsa_v:
 .long	0x00400000,0x00400000,0x00400000,0x00400000
 .long	0x00400000,0x00400000,0x00400000,0x00400000
@@ -78,6 +93,11 @@ mldsa_v:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_avx2_zetas:
 .long	0x000064f7,0x000064f7,0x000064f7,0x000064f7
 .long	0x000064f7,0x000064f7,0x000064f7,0x000064f7
@@ -400,6 +420,11 @@ L_mldsa_avx2_zetas:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_avx2_zetas_basemul:
 .long	0xffc406e5,0xffe9d65d,0x003cf91b,0x001729a3
 .long	0xffe8ac81,0x003509ee,0x0018537f,0xffcbf612
@@ -470,6 +495,11 @@ L_mldsa_avx2_zetas_basemul:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_avx2_zetas_1:
 .long	0xffc97e01,0xffc97e01,0xffc97e01,0xffc97e01
 .long	0xffc97e01,0xffc97e01,0xffc97e01,0xffc97e01
@@ -478,6 +508,11 @@ L_mldsa_avx2_zetas_1:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_avx2_zetas_inv:
 .long	0xffe1d632,0x000ce94a,0xffeaa198,0xffc3ea36
 .long	0x0014c921,0x0000bcb2,0xffc430d4,0x000875b0
@@ -16149,9 +16184,9 @@ _wc_mldsa_mul_vec_7_avx2:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_mldsa_rej_idx:
 .quad	0x0000000000000000,0x0000000000000000
@@ -16671,6 +16706,11 @@ L_mldsa_rej_idx:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_rej_q:
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
@@ -16685,8 +16725,8 @@ L_mldsa_rej_q:
 .p2align	5
 #endif /* __APPLE__ */
 L_mldsa_rej_mask:
-.quad	0x7fffff007fffff, 0x7fffff007fffff
-.quad	0x7fffff007fffff, 0x7fffff007fffff
+.quad	0x007fffff007fffff,0x007fffff007fffff
+.quad	0x007fffff007fffff,0x007fffff007fffff
 #ifndef __APPLE__
 .data
 #else
@@ -16698,8 +16738,8 @@ L_mldsa_rej_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_mldsa_rej_shuffle:
-.quad	0x5040300020100, 0xb0a0900080706
-.quad	0x9080700060504, 0xf0e0d000c0b0a
+.quad	0x0005040300020100,0x000b0a0900080706
+.quad	0x0009080700060504,0x000f0e0d000c0b0a
 #ifndef __APPLE__
 .data
 #else
@@ -16711,8 +16751,8 @@ L_mldsa_rej_shuffle:
 .p2align	5
 #endif /* __APPLE__ */
 L_mldsa_rej_ones:
-.quad	0x101010101010101, 0x101010101010101
-.quad	0x101010101010101, 0x101010101010101
+.quad	0x0101010101010101,0x0101010101010101
+.quad	0x0101010101010101,0x0101010101010101
 #ifndef __APPLE__
 .text
 .globl	wc_mldsa_rej_uniform_n_avx2
@@ -17307,9 +17347,9 @@ L_mldsa_rej_uniform_avx2_done_64:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_mldsa_shufb_rej_idx:
 .quad	0xffffffffffffffff,0xffffffffffffffff
@@ -17579,14 +17619,8 @@ L_mldsa_shufb_rej_idx:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta2_mask_nibbles:
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
 #ifndef __APPLE__
 .data
 #else
@@ -17598,14 +17632,8 @@ L_mldsa_extract_coeffs_eta2_mask_nibbles:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta2_mul:
-.value	0x3340,0x3340
-.value	0x3340,0x3340
-.value	0x3340,0x3340
-.value	0x3340,0x3340
-.value	0x3340,0x3340
-.value	0x3340,0x3340
-.value	0x3340,0x3340
-.value	0x3340,0x3340
+.short	0x3340,0x3340,0x3340,0x3340,0x3340,0x3340,0x3340,0x3340
+.short	0x3340,0x3340,0x3340,0x3340,0x3340,0x3340,0x3340,0x3340
 #ifndef __APPLE__
 .data
 #else
@@ -17617,14 +17645,8 @@ L_mldsa_extract_coeffs_eta2_mul:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta2_five:
-.value	0x0005,0x0005
-.value	0x0005,0x0005
-.value	0x0005,0x0005
-.value	0x0005,0x0005
-.value	0x0005,0x0005
-.value	0x0005,0x0005
-.value	0x0005,0x0005
-.value	0x0005,0x0005
+.short	0x0005,0x0005,0x0005,0x0005,0x0005,0x0005,0x0005,0x0005
+.short	0x0005,0x0005,0x0005,0x0005,0x0005,0x0005,0x0005,0x0005
 #ifndef __APPLE__
 .data
 #else
@@ -17636,19 +17658,18 @@ L_mldsa_extract_coeffs_eta2_five:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta2_two:
-.value	0x0002,0x0002
-.value	0x0002,0x0002
-.value	0x0002,0x0002
-.value	0x0002,0x0002
-.value	0x0002,0x0002
-.value	0x0002,0x0002
-.value	0x0002,0x0002
-.value	0x0002,0x0002
+.short	0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002
+.short	0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta2_nibble_table:
 .long	0x00000002,0x00000001,0x00000000,0xffffffff
 .long	0xfffffffe,0x00000002,0x00000001,0x00000000
@@ -18228,7 +18249,7 @@ L_mldsa_extract_coeffs_eta2_start_byte:
         movl	(%r13,%rbx,4), %r12d
         movl	%r12d, (%rdx)
         addl	%r11d, %r8d
-        shl	$2, %r11d
+        shll	$2, %r11d
         addq	%r11, %rdx
         cmpl	$0x100, %r8d
         je	L_mldsa_extract_coeffs_eta2_done
@@ -18238,7 +18259,7 @@ L_mldsa_extract_coeffs_eta2_start_byte:
         movl	(%r13,%rax,4), %r12d
         movl	%r12d, (%rdx)
         addl	%r11d, %r8d
-        shl	$2, %r11d
+        shll	$2, %r11d
         addq	%r11, %rdx
         cmpl	$0x100, %r8d
         je	L_mldsa_extract_coeffs_eta2_done
@@ -18264,14 +18285,8 @@ L_mldsa_extract_coeffs_eta2_done:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta4_mask_nibbles:
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
 #ifndef __APPLE__
 .data
 #else
@@ -18283,14 +18298,8 @@ L_mldsa_extract_coeffs_eta4_mask_nibbles:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta4_nine:
-.value	0x0009,0x0009
-.value	0x0009,0x0009
-.value	0x0009,0x0009
-.value	0x0009,0x0009
-.value	0x0009,0x0009
-.value	0x0009,0x0009
-.value	0x0009,0x0009
-.value	0x0009,0x0009
+.short	0x0009,0x0009,0x0009,0x0009,0x0009,0x0009,0x0009,0x0009
+.short	0x0009,0x0009,0x0009,0x0009,0x0009,0x0009,0x0009,0x0009
 #ifndef __APPLE__
 .data
 #else
@@ -18302,14 +18311,8 @@ L_mldsa_extract_coeffs_eta4_nine:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_extract_coeffs_eta4_four:
-.value	0x0004,0x0004
-.value	0x0004,0x0004
-.value	0x0004,0x0004
-.value	0x0004,0x0004
-.value	0x0004,0x0004
-.value	0x0004,0x0004
-.value	0x0004,0x0004
-.value	0x0004,0x0004
+.short	0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004
+.short	0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004
 #ifndef __APPLE__
 .text
 .globl	wc_mldsa_extract_coeffs_eta4_avx2
@@ -18832,7 +18835,7 @@ L_mldsa_extract_coeffs_eta4_start_byte:
         subl	%ebx, %r12d
         movl	%r12d, (%rdx)
         addl	%r11d, %r8d
-        shl	$2, %r11d
+        shll	$2, %r11d
         addq	%r11, %rdx
         cmpl	$0x100, %r8d
         je	L_mldsa_extract_coeffs_eta4_done
@@ -18843,7 +18846,7 @@ L_mldsa_extract_coeffs_eta4_start_byte:
         subl	%eax, %r12d
         movl	%r12d, (%rdx)
         addl	%r11d, %r8d
-        shl	$2, %r11d
+        shll	$2, %r11d
         addq	%r11, %rdx
         cmpl	$0x100, %r8d
         je	L_mldsa_extract_coeffs_eta4_done
@@ -19057,6 +19060,11 @@ _wc_mldsa_redistribute_17_rand_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_eta_2_avx2_two:
 .long	0x00000002,0x00000002,0x00000002,0x00000002
 .long	0x00000002,0x00000002,0x00000002,0x00000002
@@ -19065,6 +19073,11 @@ L_mldsa_encode_eta_2_avx2_two:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_eta_2_avx2_vs_3:
 .long	0x00000000,0x00000003,0x00000006,0x00000009
 .long	0x00000004,0x00000007,0x0000000a,0x0000000d
@@ -19079,14 +19092,10 @@ L_mldsa_encode_eta_2_avx2_vs_3:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_eta_2_avx2_shuff_3_even:
-.value	0xff00,0x504
-.value	0xff08,0xd0c
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xff00,0x504
-.value	0xff08,0xd0c
-.value	0xffff,0xffff
-.value	0xffff,0xffff
+.byte	0x00,0xff,0x04,0x05,0x08,0xff,0x0c,0x0d
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0x00,0xff,0x04,0x05,0x08,0xff,0x0c,0x0d
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -19098,14 +19107,10 @@ L_mldsa_encode_eta_2_avx2_shuff_3_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_eta_2_avx2_shuff_3_odd:
-.value	0xff02,0x7ff
-.value	0xb0a,0xfff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xff02,0x7ff
-.value	0xb0a,0xfff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
+.byte	0x02,0xff,0xff,0x07,0x0a,0x0b,0xff,0x0f
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0x02,0xff,0xff,0x07,0x0a,0x0b,0xff,0x0f
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -19117,14 +19122,10 @@ L_mldsa_encode_eta_2_avx2_shuff_3_odd:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_eta_2_avx2_shuff_6_even:
-.value	0x400,0x805
-.value	0xd0c,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x400
-.value	0x805,0xd0c
-.value	0xffff,0xffff
+.byte	0x00,0x04,0x05,0x08,0x0c,0x0d,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04
+.byte	0x05,0x08,0x0c,0x0d,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -19136,14 +19137,10 @@ L_mldsa_encode_eta_2_avx2_shuff_6_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_eta_2_avx2_shuff_6_odd:
-.value	0x302,0xa07
-.value	0xf0b,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x302
-.value	0xa07,0xf0b
-.value	0xffff,0xffff
+.byte	0x02,0x03,0x07,0x0a,0x0b,0x0f,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x03
+.byte	0x07,0x0a,0x0b,0x0f,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .text
 .globl	wc_mldsa_vec_encode_eta_2_avx2
@@ -19424,6 +19421,11 @@ L_mldsa_encode_eta_2_avx2_loop:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_eta_4_avx2_four:
 .long	0x00000004,0x00000004,0x00000004,0x00000004
 .long	0x00000004,0x00000004,0x00000004,0x00000004
@@ -19432,6 +19434,11 @@ L_mldsa_encode_eta_4_avx2_four:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_eta_4_avx2_vs_4:
 .long	0x00000000,0x00000004,0x00000000,0x00000004
 .long	0x00000000,0x00000004,0x00000000,0x00000004
@@ -20450,14 +20457,10 @@ _wc_mldsa_vec_encode_eta_4_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_0:
-.value	0xff00,0xffff
-.value	0xff00,0xffff
-.value	0x100,0xffff
-.value	0xff01,0xffff
-.value	0xff01,0xffff
-.value	0x201,0xffff
-.value	0xff02,0xffff
-.value	0xff02,0xffff
+.byte	0x00,0xff,0xff,0xff,0x00,0xff,0xff,0xff
+.byte	0x00,0x01,0xff,0xff,0x01,0xff,0xff,0xff
+.byte	0x01,0xff,0xff,0xff,0x01,0x02,0xff,0xff
+.byte	0x02,0xff,0xff,0xff,0x02,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20469,14 +20472,10 @@ L_mldsa_decode_eta_2_avx2_shuff_0:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_1:
-.value	0xff01,0xffff
-.value	0xff01,0xffff
-.value	0x201,0xffff
-.value	0xff02,0xffff
-.value	0xff02,0xffff
-.value	0x302,0xffff
-.value	0xff03,0xffff
-.value	0xff03,0xffff
+.byte	0x01,0xff,0xff,0xff,0x01,0xff,0xff,0xff
+.byte	0x01,0x02,0xff,0xff,0x02,0xff,0xff,0xff
+.byte	0x02,0xff,0xff,0xff,0x02,0x03,0xff,0xff
+.byte	0x03,0xff,0xff,0xff,0x03,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20488,14 +20487,10 @@ L_mldsa_decode_eta_2_avx2_shuff_1:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_2:
-.value	0xff02,0xffff
-.value	0xff02,0xffff
-.value	0x302,0xffff
-.value	0xff03,0xffff
-.value	0xff03,0xffff
-.value	0x403,0xffff
-.value	0xff04,0xffff
-.value	0xff04,0xffff
+.byte	0x02,0xff,0xff,0xff,0x02,0xff,0xff,0xff
+.byte	0x02,0x03,0xff,0xff,0x03,0xff,0xff,0xff
+.byte	0x03,0xff,0xff,0xff,0x03,0x04,0xff,0xff
+.byte	0x04,0xff,0xff,0xff,0x04,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20507,14 +20502,10 @@ L_mldsa_decode_eta_2_avx2_shuff_2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_3:
-.value	0xff03,0xffff
-.value	0xff03,0xffff
-.value	0x403,0xffff
-.value	0xff04,0xffff
-.value	0xff04,0xffff
-.value	0x504,0xffff
-.value	0xff05,0xffff
-.value	0xff05,0xffff
+.byte	0x03,0xff,0xff,0xff,0x03,0xff,0xff,0xff
+.byte	0x03,0x04,0xff,0xff,0x04,0xff,0xff,0xff
+.byte	0x04,0xff,0xff,0xff,0x04,0x05,0xff,0xff
+.byte	0x05,0xff,0xff,0xff,0x05,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20526,14 +20517,10 @@ L_mldsa_decode_eta_2_avx2_shuff_3:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_4:
-.value	0xff04,0xffff
-.value	0xff04,0xffff
-.value	0x504,0xffff
-.value	0xff05,0xffff
-.value	0xff05,0xffff
-.value	0x605,0xffff
-.value	0xff06,0xffff
-.value	0xff06,0xffff
+.byte	0x04,0xff,0xff,0xff,0x04,0xff,0xff,0xff
+.byte	0x04,0x05,0xff,0xff,0x05,0xff,0xff,0xff
+.byte	0x05,0xff,0xff,0xff,0x05,0x06,0xff,0xff
+.byte	0x06,0xff,0xff,0xff,0x06,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20545,14 +20532,10 @@ L_mldsa_decode_eta_2_avx2_shuff_4:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_5:
-.value	0xff05,0xffff
-.value	0xff05,0xffff
-.value	0x605,0xffff
-.value	0xff06,0xffff
-.value	0xff06,0xffff
-.value	0x706,0xffff
-.value	0xff07,0xffff
-.value	0xff07,0xffff
+.byte	0x05,0xff,0xff,0xff,0x05,0xff,0xff,0xff
+.byte	0x05,0x06,0xff,0xff,0x06,0xff,0xff,0xff
+.byte	0x06,0xff,0xff,0xff,0x06,0x07,0xff,0xff
+.byte	0x07,0xff,0xff,0xff,0x07,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20564,14 +20547,10 @@ L_mldsa_decode_eta_2_avx2_shuff_5:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_6:
-.value	0xff06,0xffff
-.value	0xff06,0xffff
-.value	0x706,0xffff
-.value	0xff07,0xffff
-.value	0xff07,0xffff
-.value	0x807,0xffff
-.value	0xff08,0xffff
-.value	0xff08,0xffff
+.byte	0x06,0xff,0xff,0xff,0x06,0xff,0xff,0xff
+.byte	0x06,0x07,0xff,0xff,0x07,0xff,0xff,0xff
+.byte	0x07,0xff,0xff,0xff,0x07,0x08,0xff,0xff
+.byte	0x08,0xff,0xff,0xff,0x08,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20583,19 +20562,20 @@ L_mldsa_decode_eta_2_avx2_shuff_6:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_shuff_7:
-.value	0xff07,0xffff
-.value	0xff07,0xffff
-.value	0x807,0xffff
-.value	0xff08,0xffff
-.value	0xff08,0xffff
-.value	0x908,0xffff
-.value	0xff09,0xffff
-.value	0xff09,0xffff
+.byte	0x07,0xff,0xff,0xff,0x07,0xff,0xff,0xff
+.byte	0x07,0x08,0xff,0xff,0x08,0xff,0xff,0xff
+.byte	0x08,0xff,0xff,0xff,0x08,0x09,0xff,0xff
+.byte	0x09,0xff,0xff,0xff,0x09,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_two:
 .long	0x00000002,0x00000002,0x00000002,0x00000002
 .long	0x00000002,0x00000002,0x00000002,0x00000002
@@ -20604,6 +20584,11 @@ L_mldsa_decode_eta_2_avx2_two:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_vs:
 .long	0x00000000,0x00000003,0x00000006,0x00000001
 .long	0x00000004,0x00000007,0x00000002,0x00000005
@@ -20612,6 +20597,11 @@ L_mldsa_decode_eta_2_avx2_vs:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_eta_2_avx2_mask:
 .long	0x00000007,0x00000007,0x00000007,0x00000007
 .long	0x00000007,0x00000007,0x00000007,0x00000007
@@ -20863,14 +20853,10 @@ _wc_mldsa_decode_eta_2_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_0:
-.value	0xff00,0xffff
-.value	0xff00,0xffff
-.value	0x100,0xffff
-.value	0xff01,0xffff
-.value	0xff01,0xffff
-.value	0x201,0xffff
-.value	0xff02,0xffff
-.value	0xff02,0xffff
+.byte	0x00,0xff,0xff,0xff,0x00,0xff,0xff,0xff
+.byte	0x00,0x01,0xff,0xff,0x01,0xff,0xff,0xff
+.byte	0x01,0xff,0xff,0xff,0x01,0x02,0xff,0xff
+.byte	0x02,0xff,0xff,0xff,0x02,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20882,14 +20868,10 @@ L_mldsa_decode_eta_4_avx2_shuff_0:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_1:
-.value	0xff01,0xffff
-.value	0xff01,0xffff
-.value	0x201,0xffff
-.value	0xff02,0xffff
-.value	0xff02,0xffff
-.value	0x302,0xffff
-.value	0xff03,0xffff
-.value	0xff03,0xffff
+.byte	0x01,0xff,0xff,0xff,0x01,0xff,0xff,0xff
+.byte	0x01,0x02,0xff,0xff,0x02,0xff,0xff,0xff
+.byte	0x02,0xff,0xff,0xff,0x02,0x03,0xff,0xff
+.byte	0x03,0xff,0xff,0xff,0x03,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20901,14 +20883,10 @@ L_mldsa_decode_eta_4_avx2_shuff_1:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_2:
-.value	0xff02,0xffff
-.value	0xff02,0xffff
-.value	0x302,0xffff
-.value	0xff03,0xffff
-.value	0xff03,0xffff
-.value	0x403,0xffff
-.value	0xff04,0xffff
-.value	0xff04,0xffff
+.byte	0x02,0xff,0xff,0xff,0x02,0xff,0xff,0xff
+.byte	0x02,0x03,0xff,0xff,0x03,0xff,0xff,0xff
+.byte	0x03,0xff,0xff,0xff,0x03,0x04,0xff,0xff
+.byte	0x04,0xff,0xff,0xff,0x04,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20920,14 +20898,10 @@ L_mldsa_decode_eta_4_avx2_shuff_2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_3:
-.value	0xff03,0xffff
-.value	0xff03,0xffff
-.value	0x403,0xffff
-.value	0xff04,0xffff
-.value	0xff04,0xffff
-.value	0x504,0xffff
-.value	0xff05,0xffff
-.value	0xff05,0xffff
+.byte	0x03,0xff,0xff,0xff,0x03,0xff,0xff,0xff
+.byte	0x03,0x04,0xff,0xff,0x04,0xff,0xff,0xff
+.byte	0x04,0xff,0xff,0xff,0x04,0x05,0xff,0xff
+.byte	0x05,0xff,0xff,0xff,0x05,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20939,14 +20913,10 @@ L_mldsa_decode_eta_4_avx2_shuff_3:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_4:
-.value	0xff04,0xffff
-.value	0xff04,0xffff
-.value	0x504,0xffff
-.value	0xff05,0xffff
-.value	0xff05,0xffff
-.value	0x605,0xffff
-.value	0xff06,0xffff
-.value	0xff06,0xffff
+.byte	0x04,0xff,0xff,0xff,0x04,0xff,0xff,0xff
+.byte	0x04,0x05,0xff,0xff,0x05,0xff,0xff,0xff
+.byte	0x05,0xff,0xff,0xff,0x05,0x06,0xff,0xff
+.byte	0x06,0xff,0xff,0xff,0x06,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20958,14 +20928,10 @@ L_mldsa_decode_eta_4_avx2_shuff_4:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_5:
-.value	0xff05,0xffff
-.value	0xff05,0xffff
-.value	0x605,0xffff
-.value	0xff06,0xffff
-.value	0xff06,0xffff
-.value	0x706,0xffff
-.value	0xff07,0xffff
-.value	0xff07,0xffff
+.byte	0x05,0xff,0xff,0xff,0x05,0xff,0xff,0xff
+.byte	0x05,0x06,0xff,0xff,0x06,0xff,0xff,0xff
+.byte	0x06,0xff,0xff,0xff,0x06,0x07,0xff,0xff
+.byte	0x07,0xff,0xff,0xff,0x07,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20977,14 +20943,10 @@ L_mldsa_decode_eta_4_avx2_shuff_5:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_6:
-.value	0xff06,0xffff
-.value	0xff06,0xffff
-.value	0x706,0xffff
-.value	0xff07,0xffff
-.value	0xff07,0xffff
-.value	0x807,0xffff
-.value	0xff08,0xffff
-.value	0xff08,0xffff
+.byte	0x06,0xff,0xff,0xff,0x06,0xff,0xff,0xff
+.byte	0x06,0x07,0xff,0xff,0x07,0xff,0xff,0xff
+.byte	0x07,0xff,0xff,0xff,0x07,0x08,0xff,0xff
+.byte	0x08,0xff,0xff,0xff,0x08,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -20996,19 +20958,20 @@ L_mldsa_decode_eta_4_avx2_shuff_6:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_shuff_7:
-.value	0xff07,0xffff
-.value	0xff07,0xffff
-.value	0x807,0xffff
-.value	0xff08,0xffff
-.value	0xff08,0xffff
-.value	0x908,0xffff
-.value	0xff09,0xffff
-.value	0xff09,0xffff
+.byte	0x07,0xff,0xff,0xff,0x07,0xff,0xff,0xff
+.byte	0x07,0x08,0xff,0xff,0x08,0xff,0xff,0xff
+.byte	0x08,0xff,0xff,0xff,0x08,0x09,0xff,0xff
+.byte	0x09,0xff,0xff,0xff,0x09,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_four:
 .long	0x00000004,0x00000004,0x00000004,0x00000004
 .long	0x00000004,0x00000004,0x00000004,0x00000004
@@ -21017,6 +20980,11 @@ L_mldsa_decode_eta_4_avx2_four:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_vs:
 .long	0x00000000,0x00000004,0x00000008,0x0000000c
 .long	0x00000010,0x00000014,0x00000018,0x0000001c
@@ -21025,6 +20993,11 @@ L_mldsa_decode_eta_4_avx2_vs:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_eta_4_avx2_mask:
 .long	0x0000000f,0x0000000f,0x0000000f,0x0000000f
 .long	0x0000000f,0x0000000f,0x0000000f,0x0000000f
@@ -21220,14 +21193,10 @@ _wc_mldsa_decode_eta_4_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_w1_88_avx2_shuff_0_even:
-.value	0x900,0xff0a
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xff
-.value	0xa09,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
+.byte	0x00,0x09,0x0a,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0x00,0x09,0x0a,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -21239,14 +21208,10 @@ L_mldsa_encode_w1_88_avx2_shuff_0_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_w1_88_avx2_shuff_0_odd:
-.value	0x504,0xff0e
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x4ff
-.value	0xe05,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
+.byte	0x04,0x05,0x0e,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0x04,0x05,0x0e,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -21258,14 +21223,10 @@ L_mldsa_encode_w1_88_avx2_shuff_0_odd:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_w1_88_avx2_shuff_1_even:
-.value	0xffff,0xffff
-.value	0xffff,0x900
-.value	0xff0a,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xff,0xa09
-.value	0xffff,0xffff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x09
+.byte	0x0a,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0x00,0x09,0x0a,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -21277,19 +21238,20 @@ L_mldsa_encode_w1_88_avx2_shuff_1_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_w1_88_avx2_shuff_1_odd:
-.value	0xffff,0xffff
-.value	0xffff,0x504
-.value	0xff0e,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0x4ff,0xe05
-.value	0xffff,0xffff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x04,0x05
+.byte	0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0x04,0x05,0x0e,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_w1_88_avx2_vs:
 .long	0x00000000,0x00000006,0x0000000c,0x00000012
 .long	0x00000000,0x00000006,0x0000000c,0x00000012
@@ -21577,6 +21539,11 @@ _wc_mldsa_encode_w1_88_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_w1_32_avx2_vs_4:
 .long	0x00000000,0x00000004,0x00000000,0x00000004
 .long	0x00000000,0x00000004,0x00000000,0x00000004
@@ -21736,6 +21703,11 @@ _wc_mldsa_encode_w1_32_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_d_max_half_m1:
 .long	0x00000fff,0x00000fff,0x00000fff,0x00000fff
 .long	0x00000fff,0x00000fff,0x00000fff,0x00000fff
@@ -21744,6 +21716,11 @@ L_mldsa_encode_t0_t1_avx2_d_max_half_m1:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_d_max_half:
 .long	0x00001000,0x00001000,0x00001000,0x00001000
 .long	0x00001000,0x00001000,0x00001000,0x00001000
@@ -21752,6 +21729,11 @@ L_mldsa_encode_t0_t1_avx2_d_max_half:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_vs_13:
 .long	0x00000000,0x0000000d,0x00000002,0x0000000f
 .long	0x00000004,0x00000011,0x00000006,0x00000013
@@ -21766,14 +21748,10 @@ L_mldsa_encode_t0_t1_avx2_vs_13:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_shuff_13_even:
-.value	0x100,0x8ff
-.value	0xff09,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x100
-.value	0x802,0xa09
-.value	0xffff,0xffff
+.byte	0x00,0x01,0xff,0x08,0x09,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01
+.byte	0x02,0x08,0x09,0x0a,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -21785,19 +21763,20 @@ L_mldsa_encode_t0_t1_avx2_shuff_13_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_shuff_13_odd:
-.value	0x5ff,0x706
-.value	0xe0d,0xff0f
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0x706,0xeff
-.value	0xff0f,0xffff
+.byte	0xff,0x05,0x06,0x07,0x0d,0x0e,0x0f,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0x06,0x07,0xff,0x0e,0x0f,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_vs_10:
 .long	0x00000000,0x0000000a,0x00000004,0x0000000e
 .long	0x00000000,0x0000000a,0x00000004,0x0000000e
@@ -21812,14 +21791,10 @@ L_mldsa_encode_t0_t1_avx2_vs_10:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_shuff_10_even:
-.value	0x100,0x908
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xff,0x801
-.value	0xff09,0xffff
-.value	0xffff,0xffff
+.byte	0x00,0x01,0x08,0x09,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x08
+.byte	0x09,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -21831,14 +21806,10 @@ L_mldsa_encode_t0_t1_avx2_shuff_10_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_t0_t1_avx2_shuff_10_odd:
-.value	0x5ff,0xd06
-.value	0xff0e,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x605
-.value	0xe0d,0xffff
-.value	0xffff,0xffff
+.byte	0xff,0x05,0x06,0x0d,0x0e,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x05,0x06
+.byte	0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .text
 .globl	wc_mldsa_vec_encode_t0_t1_avx2
@@ -22583,14 +22554,10 @@ L_mldsa_encode_t0_t1_avx2_loop:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_0:
-.value	0x100,0xffff
-.value	0x1ff,0x302
-.value	0x403,0xff05
-.value	0x504,0x706
-.value	0x706,0xff08
-.value	0x8ff,0xff09
-.value	0xa09,0xff0b
-.value	0xffff,0xc0b
+.byte	0x00,0x01,0xff,0xff,0xff,0x01,0x02,0x03
+.byte	0x03,0x04,0x05,0xff,0x04,0x05,0x06,0x07
+.byte	0x06,0x07,0x08,0xff,0xff,0x08,0x09,0xff
+.byte	0x09,0x0a,0x0b,0xff,0xff,0xff,0x0b,0x0c
 #ifndef __APPLE__
 .data
 #else
@@ -22602,14 +22569,10 @@ L_mldsa_decode_t0_avx2_shuff_0:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_1:
-.value	0x605,0xffff
-.value	0x6ff,0x807
-.value	0x908,0xffff
-.value	0xa09,0xff0b
-.value	0x403,0xff05
-.value	0x5ff,0xff06
-.value	0x706,0xff08
-.value	0xffff,0x908
+.byte	0x05,0x06,0xff,0xff,0xff,0x06,0x07,0x08
+.byte	0x08,0x09,0xff,0xff,0x09,0x0a,0x0b,0xff
+.byte	0x03,0x04,0x05,0xff,0xff,0x05,0x06,0xff
+.byte	0x06,0x07,0x08,0xff,0xff,0xff,0x08,0x09
 #ifndef __APPLE__
 .data
 #else
@@ -22621,14 +22584,10 @@ L_mldsa_decode_t0_avx2_shuff_1:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_2:
-.value	0x302,0xffff
-.value	0x3ff,0x504
-.value	0x605,0xffff
-.value	0x706,0xff08
-.value	0x100,0xff02
-.value	0x2ff,0xff03
-.value	0x403,0xff05
-.value	0xffff,0x605
+.byte	0x02,0x03,0xff,0xff,0xff,0x03,0x04,0x05
+.byte	0x05,0x06,0xff,0xff,0x06,0x07,0x08,0xff
+.byte	0x00,0x01,0x02,0xff,0xff,0x02,0x03,0xff
+.byte	0x03,0x04,0x05,0xff,0xff,0xff,0x05,0x06
 #ifndef __APPLE__
 .data
 #else
@@ -22640,14 +22599,10 @@ L_mldsa_decode_t0_avx2_shuff_2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_3:
-.value	0x807,0xffff
-.value	0x8ff,0xa09
-.value	0xb0a,0xffff
-.value	0xc0b,0xff0d
-.value	0x605,0xff07
-.value	0x7ff,0xff08
-.value	0x908,0xff0a
-.value	0xffff,0xb0a
+.byte	0x07,0x08,0xff,0xff,0xff,0x08,0x09,0x0a
+.byte	0x0a,0x0b,0xff,0xff,0x0b,0x0c,0x0d,0xff
+.byte	0x05,0x06,0x07,0xff,0xff,0x07,0x08,0xff
+.byte	0x08,0x09,0x0a,0xff,0xff,0xff,0x0a,0x0b
 #ifndef __APPLE__
 .data
 #else
@@ -22659,14 +22614,10 @@ L_mldsa_decode_t0_avx2_shuff_3:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_4:
-.value	0x504,0xffff
-.value	0x5ff,0x706
-.value	0x807,0xffff
-.value	0x908,0xff0a
-.value	0x302,0xff04
-.value	0x4ff,0xff05
-.value	0x605,0xff07
-.value	0xffff,0x807
+.byte	0x04,0x05,0xff,0xff,0xff,0x05,0x06,0x07
+.byte	0x07,0x08,0xff,0xff,0x08,0x09,0x0a,0xff
+.byte	0x02,0x03,0x04,0xff,0xff,0x04,0x05,0xff
+.byte	0x05,0x06,0x07,0xff,0xff,0xff,0x07,0x08
 #ifndef __APPLE__
 .data
 #else
@@ -22678,14 +22629,10 @@ L_mldsa_decode_t0_avx2_shuff_4:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_5:
-.value	0x201,0xffff
-.value	0x2ff,0x403
-.value	0x504,0xffff
-.value	0x605,0xff07
-.value	0x807,0xff09
-.value	0x9ff,0xff0a
-.value	0xb0a,0xff0c
-.value	0xffff,0xd0c
+.byte	0x01,0x02,0xff,0xff,0xff,0x02,0x03,0x04
+.byte	0x04,0x05,0xff,0xff,0x05,0x06,0x07,0xff
+.byte	0x07,0x08,0x09,0xff,0xff,0x09,0x0a,0xff
+.byte	0x0a,0x0b,0x0c,0xff,0xff,0xff,0x0c,0x0d
 #ifndef __APPLE__
 .data
 #else
@@ -22697,14 +22644,10 @@ L_mldsa_decode_t0_avx2_shuff_5:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_6:
-.value	0x706,0xffff
-.value	0x7ff,0x908
-.value	0xa09,0xffff
-.value	0xb0a,0xff0c
-.value	0x504,0xff06
-.value	0x6ff,0x807
-.value	0x807,0xff09
-.value	0xffff,0xa09
+.byte	0x06,0x07,0xff,0xff,0xff,0x07,0x08,0x09
+.byte	0x09,0x0a,0xff,0xff,0x0a,0x0b,0x0c,0xff
+.byte	0x04,0x05,0x06,0xff,0xff,0x06,0x07,0x08
+.byte	0x07,0x08,0x09,0xff,0xff,0xff,0x09,0x0a
 #ifndef __APPLE__
 .data
 #else
@@ -22716,19 +22659,20 @@ L_mldsa_decode_t0_avx2_shuff_6:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_shuff_7:
-.value	0x403,0xffff
-.value	0x4ff,0x605
-.value	0x706,0xffff
-.value	0x807,0xff09
-.value	0x201,0xff03
-.value	0x3ff,0xff04
-.value	0x504,0xff06
-.value	0xffff,0x706
+.byte	0x03,0x04,0xff,0xff,0xff,0x04,0x05,0x06
+.byte	0x06,0x07,0xff,0xff,0x07,0x08,0x09,0xff
+.byte	0x01,0x02,0x03,0xff,0xff,0x03,0x04,0xff
+.byte	0x04,0x05,0x06,0xff,0xff,0xff,0x06,0x07
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_vs_8:
 .long	0x00000000,0x0000000d,0x00000002,0x00000007
 .long	0x00000004,0x00000009,0x00000006,0x00000013
@@ -22737,6 +22681,11 @@ L_mldsa_decode_t0_avx2_vs_8:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_mask:
 .long	0x00001fff,0x00001fff,0x00001fff,0x00001fff
 .long	0x00001fff,0x00001fff,0x00001fff,0x00001fff
@@ -22745,6 +22694,11 @@ L_mldsa_decode_t0_avx2_mask:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_t0_avx2_d_max_half:
 .long	0x00001000,0x00001000,0x00001000,0x00001000
 .long	0x00001000,0x00001000,0x00001000,0x00001000
@@ -23038,14 +22992,10 @@ _wc_mldsa_decode_t0_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t1_avx2_shuff_0:
-.value	0x100,0xffff
-.value	0x1ff,0xff02
-.value	0x302,0xffff
-.value	0x3ff,0xff04
-.value	0x605,0xffff
-.value	0x6ff,0xff07
-.value	0x807,0xffff
-.value	0x8ff,0xff09
+.byte	0x00,0x01,0xff,0xff,0xff,0x01,0x02,0xff
+.byte	0x02,0x03,0xff,0xff,0xff,0x03,0x04,0xff
+.byte	0x05,0x06,0xff,0xff,0xff,0x06,0x07,0xff
+.byte	0x07,0x08,0xff,0xff,0xff,0x08,0x09,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -23057,14 +23007,10 @@ L_mldsa_decode_t1_avx2_shuff_0:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t1_avx2_shuff_1:
-.value	0x302,0xffff
-.value	0x3ff,0xff04
-.value	0x504,0xffff
-.value	0x5ff,0xff06
-.value	0x807,0xffff
-.value	0x8ff,0xff09
-.value	0xa09,0xff08
-.value	0xaff,0xff0b
+.byte	0x02,0x03,0xff,0xff,0xff,0x03,0x04,0xff
+.byte	0x04,0x05,0xff,0xff,0xff,0x05,0x06,0xff
+.byte	0x07,0x08,0xff,0xff,0xff,0x08,0x09,0xff
+.byte	0x09,0x0a,0x08,0xff,0xff,0x0a,0x0b,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -23076,14 +23022,10 @@ L_mldsa_decode_t1_avx2_shuff_1:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t1_avx2_shuff_2:
-.value	0x504,0xffff
-.value	0x5ff,0xff06
-.value	0x706,0xffff
-.value	0x7ff,0xff08
-.value	0x201,0xffff
-.value	0x2ff,0xff03
-.value	0x403,0xffff
-.value	0x4ff,0xff05
+.byte	0x04,0x05,0xff,0xff,0xff,0x05,0x06,0xff
+.byte	0x06,0x07,0xff,0xff,0xff,0x07,0x08,0xff
+.byte	0x01,0x02,0xff,0xff,0xff,0x02,0x03,0xff
+.byte	0x03,0x04,0xff,0xff,0xff,0x04,0x05,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -23095,19 +23037,20 @@ L_mldsa_decode_t1_avx2_shuff_2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_t1_avx2_shuff_3:
-.value	0x706,0xffff
-.value	0x7ff,0xff08
-.value	0x908,0xffff
-.value	0x9ff,0xff0a
-.value	0x403,0xffff
-.value	0x4ff,0xff05
-.value	0x605,0xffff
-.value	0x6ff,0xff07
+.byte	0x06,0x07,0xff,0xff,0xff,0x07,0x08,0xff
+.byte	0x08,0x09,0xff,0xff,0xff,0x09,0x0a,0xff
+.byte	0x03,0x04,0xff,0xff,0xff,0x04,0x05,0xff
+.byte	0x05,0x06,0xff,0xff,0xff,0x06,0x07,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_t1_avx2_vs_8:
 .long	0x00000000,0x0000000a,0x00000004,0x0000000e
 .long	0x00000000,0x0000000a,0x00000004,0x0000000e
@@ -23116,6 +23059,11 @@ L_mldsa_decode_t1_avx2_vs_8:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_t1_avx2_mask:
 .long	0x000003ff,0x000003ff,0x000003ff,0x000003ff
 .long	0x000003ff,0x000003ff,0x000003ff,0x000003ff
@@ -23396,14 +23344,10 @@ _wc_mldsa_decode_t1_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_shuff_0:
-.value	0x100,0xff02
-.value	0x302,0xff04
-.value	0x504,0xff06
-.value	0x706,0xff08
-.value	0x1ff,0x302
-.value	0x3ff,0x504
-.value	0x5ff,0x706
-.value	0x7ff,0x908
+.byte	0x00,0x01,0x02,0xff,0x02,0x03,0x04,0xff
+.byte	0x04,0x05,0x06,0xff,0x06,0x07,0x08,0xff
+.byte	0xff,0x01,0x02,0x03,0xff,0x03,0x04,0x05
+.byte	0xff,0x05,0x06,0x07,0xff,0x07,0x08,0x09
 #ifndef __APPLE__
 .data
 #else
@@ -23415,14 +23359,10 @@ L_mldsa_decode_gamma1_17_avx2_shuff_0:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_shuff_1:
-.value	0x302,0xff04
-.value	0x504,0xff06
-.value	0x706,0xff08
-.value	0x908,0xff0a
-.value	0x3ff,0x504
-.value	0x5ff,0x706
-.value	0x7ff,0x908
-.value	0x9ff,0xb0a
+.byte	0x02,0x03,0x04,0xff,0x04,0x05,0x06,0xff
+.byte	0x06,0x07,0x08,0xff,0x08,0x09,0x0a,0xff
+.byte	0xff,0x03,0x04,0x05,0xff,0x05,0x06,0x07
+.byte	0xff,0x07,0x08,0x09,0xff,0x09,0x0a,0x0b
 #ifndef __APPLE__
 .data
 #else
@@ -23434,14 +23374,10 @@ L_mldsa_decode_gamma1_17_avx2_shuff_1:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_shuff_2:
-.value	0x504,0xff06
-.value	0x706,0xff08
-.value	0x908,0xff0a
-.value	0xb0a,0xff0c
-.value	0x5ff,0x706
-.value	0x7ff,0x908
-.value	0x9ff,0xb0a
-.value	0xbff,0xd0c
+.byte	0x04,0x05,0x06,0xff,0x06,0x07,0x08,0xff
+.byte	0x08,0x09,0x0a,0xff,0x0a,0x0b,0x0c,0xff
+.byte	0xff,0x05,0x06,0x07,0xff,0x07,0x08,0x09
+.byte	0xff,0x09,0x0a,0x0b,0xff,0x0b,0x0c,0x0d
 #ifndef __APPLE__
 .data
 #else
@@ -23453,19 +23389,20 @@ L_mldsa_decode_gamma1_17_avx2_shuff_2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_shuff_3:
-.value	0x706,0xff08
-.value	0x908,0xff0a
-.value	0xb0a,0xff0c
-.value	0xd0c,0xff0e
-.value	0x7ff,0x908
-.value	0x9ff,0xb0a
-.value	0xbff,0xd0c
-.value	0xdff,0xf0e
+.byte	0x06,0x07,0x08,0xff,0x08,0x09,0x0a,0xff
+.byte	0x0a,0x0b,0x0c,0xff,0x0c,0x0d,0x0e,0xff
+.byte	0xff,0x07,0x08,0x09,0xff,0x09,0x0a,0x0b
+.byte	0xff,0x0b,0x0c,0x0d,0xff,0x0d,0x0e,0x0f
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_vs_8:
 .long	0x00000000,0x00000002,0x00000004,0x00000006
 .long	0x00000008,0x0000000a,0x0000000c,0x0000000e
@@ -23474,6 +23411,11 @@ L_mldsa_decode_gamma1_17_avx2_vs_8:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_mask:
 .long	0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff
 .long	0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff
@@ -23482,6 +23424,11 @@ L_mldsa_decode_gamma1_17_avx2_mask:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_gamma1_17_avx2_gamma17:
 .long	0x00020000,0x00020000,0x00020000,0x00020000
 .long	0x00020000,0x00020000,0x00020000,0x00020000
@@ -23778,14 +23725,10 @@ _wc_mldsa_decode_gamma1_17_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_gamma1_20_avx2_shuff_0:
-.value	0x100,0xff02
-.value	0x302,0xff04
-.value	0x605,0xff07
-.value	0x807,0xff09
-.value	0x2ff,0x403
-.value	0x4ff,0x605
-.value	0x7ff,0x908
-.value	0x9ff,0xb0a
+.byte	0x00,0x01,0x02,0xff,0x02,0x03,0x04,0xff
+.byte	0x05,0x06,0x07,0xff,0x07,0x08,0x09,0xff
+.byte	0xff,0x02,0x03,0x04,0xff,0x04,0x05,0x06
+.byte	0xff,0x07,0x08,0x09,0xff,0x09,0x0a,0x0b
 #ifndef __APPLE__
 .data
 #else
@@ -23797,19 +23740,20 @@ L_mldsa_decode_gamma1_20_avx2_shuff_0:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_decode_gamma1_20_avx2_shuff_1:
-.value	0x504,0xff06
-.value	0x706,0xff08
-.value	0xa09,0xff0b
-.value	0xc0b,0xff0d
-.value	0x6ff,0x807
-.value	0x8ff,0xa09
-.value	0xbff,0xd0c
-.value	0xdff,0xf0e
+.byte	0x04,0x05,0x06,0xff,0x06,0x07,0x08,0xff
+.byte	0x09,0x0a,0x0b,0xff,0x0b,0x0c,0x0d,0xff
+.byte	0xff,0x06,0x07,0x08,0xff,0x08,0x09,0x0a
+.byte	0xff,0x0b,0x0c,0x0d,0xff,0x0d,0x0e,0x0f
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_gamma1_20_avx2_vs_8:
 .long	0x00000000,0x00000004,0x00000000,0x00000004
 .long	0x00000008,0x0000000c,0x00000008,0x0000000c
@@ -23818,6 +23762,11 @@ L_mldsa_decode_gamma1_20_avx2_vs_8:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_gamma1_20_avx2_mask:
 .long	0x000fffff,0x000fffff,0x000fffff,0x000fffff
 .long	0x000fffff,0x000fffff,0x000fffff,0x000fffff
@@ -23826,6 +23775,11 @@ L_mldsa_decode_gamma1_20_avx2_mask:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decode_gamma1_20_avx2_gamma19:
 .long	0x00080000,0x00080000,0x00080000,0x00080000
 .long	0x00080000,0x00080000,0x00080000,0x00080000
@@ -24116,6 +24070,11 @@ _wc_mldsa_decode_gamma1_19_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_gamma1_17_avx2_gamma17:
 .long	0x00020000,0x00020000,0x00020000,0x00020000
 .long	0x00020000,0x00020000,0x00020000,0x00020000
@@ -24130,14 +24089,10 @@ L_mldsa_encode_gamma1_17_avx2_gamma17:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_gamma1_17_avx2_shuff_even:
-.value	0x100,0xff02
-.value	0x908,0xff0a
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0x100,0xff02
-.value	0x908,0xff0a
-.value	0xffff,0xffff
-.value	0xffff,0xffff
+.byte	0x00,0x01,0x02,0xff,0x08,0x09,0x0a,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0x00,0x01,0x02,0xff,0x08,0x09,0x0a,0xff
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -24149,19 +24104,20 @@ L_mldsa_encode_gamma1_17_avx2_shuff_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_gamma1_17_avx2_shuff_odd:
-.value	0xffff,0x504
-.value	0xff06,0xd0c
-.value	0xff0e,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x504
-.value	0xff06,0xd0c
-.value	0xff0e,0xffff
-.value	0xffff,0xffff
+.byte	0xff,0xff,0x04,0x05,0x06,0xff,0x0c,0x0d
+.byte	0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0x04,0x05,0x06,0xff,0x0c,0x0d
+.byte	0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_gamma1_17_avx2_vs:
 .long	0x00000000,0x00000002,0x00000004,0x00000006
 .long	0x00000000,0x00000002,0x00000004,0x00000006
@@ -24576,6 +24532,11 @@ _wc_mldsa_encode_gamma1_17_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_gamma1_19_avx2_gamma19:
 .long	0x00080000,0x00080000,0x00080000,0x00080000
 .long	0x00080000,0x00080000,0x00080000,0x00080000
@@ -24590,14 +24551,10 @@ L_mldsa_encode_gamma1_19_avx2_gamma19:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_gamma1_19_avx2_shuff_even:
-.value	0x100,0xff02
-.value	0x8ff,0xa09
-.value	0xffff,0xffff
-.value	0xffff,0xffff
-.value	0x100,0xff02
-.value	0x8ff,0xa09
-.value	0xffff,0xffff
-.value	0xffff,0xffff
+.byte	0x00,0x01,0x02,0xff,0xff,0x08,0x09,0x0a
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0x00,0x01,0x02,0xff,0xff,0x08,0x09,0x0a
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
@@ -24609,19 +24566,20 @@ L_mldsa_encode_gamma1_19_avx2_shuff_even:
 .p2align	4
 #endif /* __APPLE__ */
 L_mldsa_encode_gamma1_19_avx2_shuff_odd:
-.value	0xffff,0x504
-.value	0xff06,0xcff
-.value	0xe0d,0xffff
-.value	0xffff,0xffff
-.value	0xffff,0x504
-.value	0xff06,0xcff
-.value	0xe0d,0xffff
-.value	0xffff,0xffff
+.byte	0xff,0xff,0x04,0x05,0x06,0xff,0xff,0x0c
+.byte	0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0x04,0x05,0x06,0xff,0xff,0x0c
+.byte	0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_encode_gamma1_19_avx2_vs:
 .long	0x00000000,0x00000004,0x00000000,0x00000004
 .long	0x00000000,0x00000004,0x00000000,0x00000004
@@ -25036,6 +24994,11 @@ _wc_mldsa_encode_gamma1_19_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q88_avx2_q_low_88:
 .long	0x00017400,0x00017400,0x00017400,0x00017400
 .long	0x00017400,0x00017400,0x00017400,0x00017400
@@ -25044,6 +25007,11 @@ L_mldsa_decompose_q88_avx2_q_low_88:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q88_avx2_q_low_88_2:
 .long	0x0002e800,0x0002e800,0x0002e800,0x0002e800
 .long	0x0002e800,0x0002e800,0x0002e800,0x0002e800
@@ -25052,6 +25020,11 @@ L_mldsa_decompose_q88_avx2_q_low_88_2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q88_avx2_q_2:
 .long	0x003fefd4,0x003fefd4,0x003fefd4,0x003fefd4
 .long	0x003fefd4,0x003fefd4,0x003fefd4,0x003fefd4
@@ -25060,6 +25033,11 @@ L_mldsa_decompose_q88_avx2_q_2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q88_avx2_44:
 .long	0x0000002c,0x0000002c,0x0000002c,0x0000002c
 .long	0x0000002c,0x0000002c,0x0000002c,0x0000002c
@@ -27269,6 +27247,11 @@ _wc_mldsa_decompose_q88_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q32_avx2_q_low_32:
 .long	0x0003ff00,0x0003ff00,0x0003ff00,0x0003ff00
 .long	0x0003ff00,0x0003ff00,0x0003ff00,0x0003ff00
@@ -27277,6 +27260,11 @@ L_mldsa_decompose_q32_avx2_q_low_32:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q32_avx2_q_low_32_2:
 .long	0x0007fe00,0x0007fe00,0x0007fe00,0x0007fe00
 .long	0x0007fe00,0x0007fe00,0x0007fe00,0x0007fe00
@@ -27285,6 +27273,11 @@ L_mldsa_decompose_q32_avx2_q_low_32_2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q32_avx2_q_low_32_m1:
 .long	0x0003feff,0x0003feff,0x0003feff,0x0003feff
 .long	0x0003feff,0x0003feff,0x0003feff,0x0003feff
@@ -27293,6 +27286,11 @@ L_mldsa_decompose_q32_avx2_q_low_32_m1:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_decompose_q32_avx2_mask:
 .long	0x0000000f,0x0000000f,0x0000000f,0x0000000f
 .long	0x0000000f,0x0000000f,0x0000000f,0x0000000f
@@ -27808,6 +27806,11 @@ L_mldsa_decompose_q32_avx2_start_256:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_q:
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
@@ -27816,6 +27819,11 @@ L_mldsa_use_hint_88_avx2_q:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_q_low_88:
 .long	0x00017400,0x00017400,0x00017400,0x00017400
 .long	0x00017400,0x00017400,0x00017400,0x00017400
@@ -27824,6 +27832,11 @@ L_mldsa_use_hint_88_avx2_q_low_88:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_q_low_88_2:
 .long	0x0002e800,0x0002e800,0x0002e800,0x0002e800
 .long	0x0002e800,0x0002e800,0x0002e800,0x0002e800
@@ -27832,6 +27845,11 @@ L_mldsa_use_hint_88_avx2_q_low_88_2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_q_2:
 .long	0x003fefd4,0x003fefd4,0x003fefd4,0x003fefd4
 .long	0x003fefd4,0x003fefd4,0x003fefd4,0x003fefd4
@@ -27840,6 +27858,11 @@ L_mldsa_use_hint_88_avx2_q_2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_44:
 .long	0x0000002c,0x0000002c,0x0000002c,0x0000002c
 .long	0x0000002c,0x0000002c,0x0000002c,0x0000002c
@@ -27848,6 +27871,11 @@ L_mldsa_use_hint_88_avx2_44:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_vsl:
 .long	0x0000001f,0x0000001e,0x0000001d,0x0000001c
 .long	0x0000001b,0x0000001a,0x00000019,0x00000018
@@ -27856,6 +27884,11 @@ L_mldsa_use_hint_88_avx2_vsl:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_88_avx2_one:
 .long	0x00000001,0x00000001,0x00000001,0x00000001
 .long	0x00000001,0x00000001,0x00000001,0x00000001
@@ -33271,6 +33304,11 @@ L_mldsa_use_hint_88_avx2_hints_done_3_15:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_q:
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
 .long	0x007fe001,0x007fe001,0x007fe001,0x007fe001
@@ -33279,6 +33317,11 @@ L_mldsa_use_hint_32_avx2_q:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_q_low_32:
 .long	0x0003ff00,0x0003ff00,0x0003ff00,0x0003ff00
 .long	0x0003ff00,0x0003ff00,0x0003ff00,0x0003ff00
@@ -33287,6 +33330,11 @@ L_mldsa_use_hint_32_avx2_q_low_32:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_q_low_32_2:
 .long	0x0007fe00,0x0007fe00,0x0007fe00,0x0007fe00
 .long	0x0007fe00,0x0007fe00,0x0007fe00,0x0007fe00
@@ -33295,6 +33343,11 @@ L_mldsa_use_hint_32_avx2_q_low_32_2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_q_low_32_m1:
 .long	0x0003feff,0x0003feff,0x0003feff,0x0003feff
 .long	0x0003feff,0x0003feff,0x0003feff,0x0003feff
@@ -33303,6 +33356,11 @@ L_mldsa_use_hint_32_avx2_q_low_32_m1:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_mask:
 .long	0x0000000f,0x0000000f,0x0000000f,0x0000000f
 .long	0x0000000f,0x0000000f,0x0000000f,0x0000000f
@@ -33311,6 +33369,11 @@ L_mldsa_use_hint_32_avx2_mask:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_vsl:
 .long	0x0000001f,0x0000001e,0x0000001d,0x0000001c
 .long	0x0000001b,0x0000001a,0x00000019,0x00000018
@@ -33319,6 +33382,11 @@ L_mldsa_use_hint_32_avx2_vsl:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mldsa_use_hint_32_avx2_one:
 .long	0x00000001,0x00000001,0x00000001,0x00000001
 .long	0x00000001,0x00000001,0x00000001,0x00000001
diff --git a/wolfcrypt/src/wc_mlkem_asm.S b/wolfcrypt/src/wc_mlkem_asm.S
index 3bc7073a6fd..5eb503c1f53 100644
--- a/wolfcrypt/src/wc_mlkem_asm.S
+++ b/wolfcrypt/src/wc_mlkem_asm.S
@@ -60,14 +60,8 @@
 .p2align	4
 #endif /* __APPLE__ */
 mlkem_q:
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
 #ifndef __APPLE__
 .data
 #else
@@ -79,14 +73,8 @@ mlkem_q:
 .p2align	4
 #endif /* __APPLE__ */
 mlkem_qinv:
-.value	0xf301,0xf301
-.value	0xf301,0xf301
-.value	0xf301,0xf301
-.value	0xf301,0xf301
-.value	0xf301,0xf301
-.value	0xf301,0xf301
-.value	0xf301,0xf301
-.value	0xf301,0xf301
+.short	0xf301,0xf301,0xf301,0xf301,0xf301,0xf301,0xf301,0xf301
+.short	0xf301,0xf301,0xf301,0xf301,0xf301,0xf301,0xf301,0xf301
 #ifndef __APPLE__
 .data
 #else
@@ -98,14 +86,8 @@ mlkem_qinv:
 .p2align	4
 #endif /* __APPLE__ */
 mlkem_f:
-.value	0x0549,0x0549
-.value	0x0549,0x0549
-.value	0x0549,0x0549
-.value	0x0549,0x0549
-.value	0x0549,0x0549
-.value	0x0549,0x0549
-.value	0x0549,0x0549
-.value	0x0549,0x0549
+.short	0x0549,0x0549,0x0549,0x0549,0x0549,0x0549,0x0549,0x0549
+.short	0x0549,0x0549,0x0549,0x0549,0x0549,0x0549,0x0549,0x0549
 #ifndef __APPLE__
 .data
 #else
@@ -117,14 +99,8 @@ mlkem_f:
 .p2align	4
 #endif /* __APPLE__ */
 mlkem_f_qinv:
-.value	0x5049,0x5049
-.value	0x5049,0x5049
-.value	0x5049,0x5049
-.value	0x5049,0x5049
-.value	0x5049,0x5049
-.value	0x5049,0x5049
-.value	0x5049,0x5049
-.value	0x5049,0x5049
+.short	0x5049,0x5049,0x5049,0x5049,0x5049,0x5049,0x5049,0x5049
+.short	0x5049,0x5049,0x5049,0x5049,0x5049,0x5049,0x5049,0x5049
 #ifndef __APPLE__
 .data
 #else
@@ -136,14 +112,8 @@ mlkem_f_qinv:
 .p2align	4
 #endif /* __APPLE__ */
 mlkem_v:
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
 #ifndef __APPLE__
 .data
 #else
@@ -155,630 +125,162 @@ mlkem_v:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_avx2_zetas:
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x0a0b,0x0a0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x7b0b,0x7b0b
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x0b9a,0x0b9a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x399a,0x399a
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x05d5,0x05d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x34d5,0x34d5
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0x058e,0x058e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0xcf8e,0xcf8e
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0x0c56,0x0c56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0xae56,0xae56
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x026e,0x026e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x6c6e,0x6c6e
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0x0629,0x0629
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0xf129,0xf129
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0x00b6,0x00b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0xc2b6,0xc2b6
-.value	0x023d,0x023d
-.value	0x023d,0x023d
-.value	0x023d,0x023d
-.value	0x023d,0x023d
-.value	0x07d4,0x07d4
-.value	0x07d4,0x07d4
-.value	0x07d4,0x07d4
-.value	0x07d4,0x07d4
-.value	0xe93d,0xe93d
-.value	0xe93d,0xe93d
-.value	0xe93d,0xe93d
-.value	0xe93d,0xe93d
-.value	0x43d4,0x43d4
-.value	0x43d4,0x43d4
-.value	0x43d4,0x43d4
-.value	0x43d4,0x43d4
-.value	0x0108,0x0108
-.value	0x0108,0x0108
-.value	0x0108,0x0108
-.value	0x0108,0x0108
-.value	0x017f,0x017f
-.value	0x017f,0x017f
-.value	0x017f,0x017f
-.value	0x017f,0x017f
-.value	0x9908,0x9908
-.value	0x9908,0x9908
-.value	0x9908,0x9908
-.value	0x9908,0x9908
-.value	0x8e7f,0x8e7f
-.value	0x8e7f,0x8e7f
-.value	0x8e7f,0x8e7f
-.value	0x8e7f,0x8e7f
-.value	0x04c7,0x04c7
-.value	0x04c7,0x04c7
-.value	0x028c,0x028c
-.value	0x028c,0x028c
-.value	0x0ad9,0x0ad9
-.value	0x0ad9,0x0ad9
-.value	0x03f7,0x03f7
-.value	0x03f7,0x03f7
-.value	0xe9c7,0xe9c7
-.value	0xe9c7,0xe9c7
-.value	0xe68c,0xe68c
-.value	0xe68c,0xe68c
-.value	0x05d9,0x05d9
-.value	0x05d9,0x05d9
-.value	0x78f7,0x78f7
-.value	0x78f7,0x78f7
-.value	0x07f4,0x07f4
-.value	0x07f4,0x07f4
-.value	0x05d3,0x05d3
-.value	0x05d3,0x05d3
-.value	0x0be7,0x0be7
-.value	0x0be7,0x0be7
-.value	0x06f9,0x06f9
-.value	0x06f9,0x06f9
-.value	0xa3f4,0xa3f4
-.value	0xa3f4,0xa3f4
-.value	0x4ed3,0x4ed3
-.value	0x4ed3,0x4ed3
-.value	0x50e7,0x50e7
-.value	0x50e7,0x50e7
-.value	0x61f9,0x61f9
-.value	0x61f9,0x61f9
-.value	0x09c4,0x09c4
-.value	0x09c4,0x09c4
-.value	0x09c4,0x09c4
-.value	0x09c4,0x09c4
-.value	0x05b2,0x05b2
-.value	0x05b2,0x05b2
-.value	0x05b2,0x05b2
-.value	0x05b2,0x05b2
-.value	0x15c4,0x15c4
-.value	0x15c4,0x15c4
-.value	0x15c4,0x15c4
-.value	0x15c4,0x15c4
-.value	0xfbb2,0xfbb2
-.value	0xfbb2,0xfbb2
-.value	0xfbb2,0xfbb2
-.value	0xfbb2,0xfbb2
-.value	0x06bf,0x06bf
-.value	0x06bf,0x06bf
-.value	0x06bf,0x06bf
-.value	0x06bf,0x06bf
-.value	0x0c7f,0x0c7f
-.value	0x0c7f,0x0c7f
-.value	0x0c7f,0x0c7f
-.value	0x0c7f,0x0c7f
-.value	0x53bf,0x53bf
-.value	0x53bf,0x53bf
-.value	0x53bf,0x53bf
-.value	0x53bf,0x53bf
-.value	0x997f,0x997f
-.value	0x997f,0x997f
-.value	0x997f,0x997f
-.value	0x997f,0x997f
-.value	0x0204,0x0204
-.value	0x0204,0x0204
-.value	0x0cf9,0x0cf9
-.value	0x0cf9,0x0cf9
-.value	0x0bc1,0x0bc1
-.value	0x0bc1,0x0bc1
-.value	0x0a67,0x0a67
-.value	0x0a67,0x0a67
-.value	0xce04,0xce04
-.value	0xce04,0xce04
-.value	0x67f9,0x67f9
-.value	0x67f9,0x67f9
-.value	0x3ec1,0x3ec1
-.value	0x3ec1,0x3ec1
-.value	0xcf67,0xcf67
-.value	0xcf67,0xcf67
-.value	0x06af,0x06af
-.value	0x06af,0x06af
-.value	0x0877,0x0877
-.value	0x0877,0x0877
-.value	0x007e,0x007e
-.value	0x007e,0x007e
-.value	0x05bd,0x05bd
-.value	0x05bd,0x05bd
-.value	0x23af,0x23af
-.value	0x23af,0x23af
-.value	0xfd77,0xfd77
-.value	0xfd77,0xfd77
-.value	0x9a7e,0x9a7e
-.value	0x9a7e,0x9a7e
-.value	0x6cbd,0x6cbd
-.value	0x6cbd,0x6cbd
-.value	0x08b2,0x08b2
-.value	0x01ae,0x01ae
-.value	0x022b,0x022b
-.value	0x034b,0x034b
-.value	0x081e,0x081e
-.value	0x0367,0x0367
-.value	0x060e,0x060e
-.value	0x0069,0x0069
-.value	0xfeb2,0xfeb2
-.value	0x2bae,0x2bae
-.value	0xd32b,0xd32b
-.value	0x344b,0x344b
-.value	0x821e,0x821e
-.value	0xc867,0xc867
-.value	0x500e,0x500e
-.value	0xab69,0xab69
-.value	0x01a6,0x01a6
-.value	0x024b,0x024b
-.value	0x00b1,0x00b1
-.value	0x0c16,0x0c16
-.value	0x0bde,0x0bde
-.value	0x0b35,0x0b35
-.value	0x0626,0x0626
-.value	0x0675,0x0675
-.value	0x93a6,0x93a6
-.value	0x334b,0x334b
-.value	0x03b1,0x03b1
-.value	0xee16,0xee16
-.value	0xc5de,0xc5de
-.value	0x5a35,0x5a35
-.value	0x1826,0x1826
-.value	0x1575,0x1575
-.value	0x0c0b,0x0c0b
-.value	0x030a,0x030a
-.value	0x0487,0x0487
-.value	0x0c6e,0x0c6e
-.value	0x09f8,0x09f8
-.value	0x05cb,0x05cb
-.value	0x0aa7,0x0aa7
-.value	0x045f,0x045f
-.value	0x7d0b,0x7d0b
-.value	0x810a,0x810a
-.value	0x2987,0x2987
-.value	0x766e,0x766e
-.value	0x71f8,0x71f8
-.value	0xb6cb,0xb6cb
-.value	0x8fa7,0x8fa7
-.value	0x315f,0x315f
-.value	0x06cb,0x06cb
-.value	0x0284,0x0284
-.value	0x0999,0x0999
-.value	0x015d,0x015d
-.value	0x01a2,0x01a2
-.value	0x0149,0x0149
-.value	0x0c65,0x0c65
-.value	0x0cb6,0x0cb6
-.value	0xb7cb,0xb7cb
-.value	0x4e84,0x4e84
-.value	0x4499,0x4499
-.value	0x485d,0x485d
-.value	0xc7a2,0xc7a2
-.value	0x4c49,0x4c49
-.value	0xeb65,0xeb65
-.value	0xceb6,0xceb6
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0714,0x0714
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x0314,0x0314
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x011f,0x011f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x6e1f,0x6e1f
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0x00ca,0x00ca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0xbeca,0xbeca
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x03c2,0x03c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x29c2,0x29c2
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x084f,0x084f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x054f,0x054f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0x073f,0x073f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0xd43f,0xd43f
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x05bc,0x05bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x79bc,0x79bc
-.value	0x0a58,0x0a58
-.value	0x0a58,0x0a58
-.value	0x0a58,0x0a58
-.value	0x0a58,0x0a58
-.value	0x03f9,0x03f9
-.value	0x03f9,0x03f9
-.value	0x03f9,0x03f9
-.value	0x03f9,0x03f9
-.value	0x9258,0x9258
-.value	0x9258,0x9258
-.value	0x9258,0x9258
-.value	0x9258,0x9258
-.value	0x5ef9,0x5ef9
-.value	0x5ef9,0x5ef9
-.value	0x5ef9,0x5ef9
-.value	0x5ef9,0x5ef9
-.value	0x02dc,0x02dc
-.value	0x02dc,0x02dc
-.value	0x02dc,0x02dc
-.value	0x02dc,0x02dc
-.value	0x0260,0x0260
-.value	0x0260,0x0260
-.value	0x0260,0x0260
-.value	0x0260,0x0260
-.value	0xd6dc,0xd6dc
-.value	0xd6dc,0xd6dc
-.value	0xd6dc,0xd6dc
-.value	0xd6dc,0xd6dc
-.value	0x2260,0x2260
-.value	0x2260,0x2260
-.value	0x2260,0x2260
-.value	0x2260,0x2260
-.value	0x09ac,0x09ac
-.value	0x09ac,0x09ac
-.value	0x0ca7,0x0ca7
-.value	0x0ca7,0x0ca7
-.value	0x0bf2,0x0bf2
-.value	0x0bf2,0x0bf2
-.value	0x033e,0x033e
-.value	0x033e,0x033e
-.value	0x4dac,0x4dac
-.value	0x4dac,0x4dac
-.value	0x91a7,0x91a7
-.value	0x91a7,0x91a7
-.value	0xc1f2,0xc1f2
-.value	0xc1f2,0xc1f2
-.value	0xdd3e,0xdd3e
-.value	0xdd3e,0xdd3e
-.value	0x006b,0x006b
-.value	0x006b,0x006b
-.value	0x0774,0x0774
-.value	0x0774,0x0774
-.value	0x0c0a,0x0c0a
-.value	0x0c0a,0x0c0a
-.value	0x094a,0x094a
-.value	0x094a,0x094a
-.value	0x916b,0x916b
-.value	0x916b,0x916b
-.value	0x2374,0x2374
-.value	0x2374,0x2374
-.value	0x8a0a,0x8a0a
-.value	0x8a0a,0x8a0a
-.value	0x474a,0x474a
-.value	0x474a,0x474a
-.value	0x06fb,0x06fb
-.value	0x06fb,0x06fb
-.value	0x06fb,0x06fb
-.value	0x06fb,0x06fb
-.value	0x019b,0x019b
-.value	0x019b,0x019b
-.value	0x019b,0x019b
-.value	0x019b,0x019b
-.value	0x47fb,0x47fb
-.value	0x47fb,0x47fb
-.value	0x47fb,0x47fb
-.value	0x47fb,0x47fb
-.value	0x229b,0x229b
-.value	0x229b,0x229b
-.value	0x229b,0x229b
-.value	0x229b,0x229b
-.value	0x0c34,0x0c34
-.value	0x0c34,0x0c34
-.value	0x0c34,0x0c34
-.value	0x0c34,0x0c34
-.value	0x06de,0x06de
-.value	0x06de,0x06de
-.value	0x06de,0x06de
-.value	0x06de,0x06de
-.value	0x6834,0x6834
-.value	0x6834,0x6834
-.value	0x6834,0x6834
-.value	0x6834,0x6834
-.value	0xc0de,0xc0de
-.value	0xc0de,0xc0de
-.value	0xc0de,0xc0de
-.value	0xc0de,0xc0de
-.value	0x0b73,0x0b73
-.value	0x0b73,0x0b73
-.value	0x03c1,0x03c1
-.value	0x03c1,0x03c1
-.value	0x071d,0x071d
-.value	0x071d,0x071d
-.value	0x0a2c,0x0a2c
-.value	0x0a2c,0x0a2c
-.value	0x3473,0x3473
-.value	0x3473,0x3473
-.value	0x36c1,0x36c1
-.value	0x36c1,0x36c1
-.value	0x8e1d,0x8e1d
-.value	0x8e1d,0x8e1d
-.value	0xce2c,0xce2c
-.value	0xce2c,0xce2c
-.value	0x01c0,0x01c0
-.value	0x01c0,0x01c0
-.value	0x08d8,0x08d8
-.value	0x08d8,0x08d8
-.value	0x02a5,0x02a5
-.value	0x02a5,0x02a5
-.value	0x0806,0x0806
-.value	0x0806,0x0806
-.value	0x41c0,0x41c0
-.value	0x41c0,0x41c0
-.value	0x10d8,0x10d8
-.value	0x10d8,0x10d8
-.value	0xa1a5,0xa1a5
-.value	0xa1a5,0xa1a5
-.value	0xba06,0xba06
-.value	0xba06,0xba06
-.value	0x0331,0x0331
-.value	0x0449,0x0449
-.value	0x025b,0x025b
-.value	0x0262,0x0262
-.value	0x052a,0x052a
-.value	0x07fc,0x07fc
-.value	0x0748,0x0748
-.value	0x0180,0x0180
-.value	0x8631,0x8631
-.value	0x4f49,0x4f49
-.value	0x635b,0x635b
-.value	0x0862,0x0862
-.value	0xe32a,0xe32a
-.value	0x3bfc,0x3bfc
-.value	0x5f48,0x5f48
-.value	0x8180,0x8180
-.value	0x0842,0x0842
-.value	0x0c79,0x0c79
-.value	0x04c2,0x04c2
-.value	0x07ca,0x07ca
-.value	0x0997,0x0997
-.value	0x00dc,0x00dc
-.value	0x085e,0x085e
-.value	0x0686,0x0686
-.value	0xae42,0xae42
-.value	0xe779,0xe779
-.value	0x2ac2,0x2ac2
-.value	0xc5ca,0xc5ca
-.value	0x5e97,0x5e97
-.value	0xd4dc,0xd4dc
-.value	0x425e,0x425e
-.value	0x3886,0x3886
-.value	0x0860,0x0860
-.value	0x0707,0x0707
-.value	0x0803,0x0803
-.value	0x031a,0x031a
-.value	0x071b,0x071b
-.value	0x09ab,0x09ab
-.value	0x099b,0x099b
-.value	0x01de,0x01de
-.value	0x2860,0x2860
-.value	0xac07,0xac07
-.value	0xe103,0xe103
-.value	0xb11a,0xb11a
-.value	0xa81b,0xa81b
-.value	0x5aab,0x5aab
-.value	0x2a9b,0x2a9b
-.value	0xbbde,0xbbde
-.value	0x0c95,0x0c95
-.value	0x0bcd,0x0bcd
-.value	0x03e4,0x03e4
-.value	0x03df,0x03df
-.value	0x03be,0x03be
-.value	0x074d,0x074d
-.value	0x05f2,0x05f2
-.value	0x065c,0x065c
-.value	0x7b95,0x7b95
-.value	0xa2cd,0xa2cd
-.value	0x6fe4,0x6fe4
-.value	0xb0df,0xb0df
-.value	0x5dbe,0x5dbe
-.value	0x1e4d,0x1e4d
-.value	0xbbf2,0xbbf2
-.value	0x5a5c,0x5a5c
+.short	0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b
+.short	0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b,0x0a0b
+.short	0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b
+.short	0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b,0x7b0b
+.short	0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a
+.short	0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a,0x0b9a
+.short	0x399a,0x399a,0x399a,0x399a,0x399a,0x399a,0x399a,0x399a
+.short	0x399a,0x399a,0x399a,0x399a,0x399a,0x399a,0x399a,0x399a
+.short	0x05d5,0x05d5,0x05d5,0x05d5,0x05d5,0x05d5,0x05d5,0x05d5
+.short	0x05d5,0x05d5,0x05d5,0x05d5,0x05d5,0x05d5,0x05d5,0x05d5
+.short	0x34d5,0x34d5,0x34d5,0x34d5,0x34d5,0x34d5,0x34d5,0x34d5
+.short	0x34d5,0x34d5,0x34d5,0x34d5,0x34d5,0x34d5,0x34d5,0x34d5
+.short	0x058e,0x058e,0x058e,0x058e,0x058e,0x058e,0x058e,0x058e
+.short	0x058e,0x058e,0x058e,0x058e,0x058e,0x058e,0x058e,0x058e
+.short	0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e
+.short	0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e,0xcf8e
+.short	0x0c56,0x0c56,0x0c56,0x0c56,0x0c56,0x0c56,0x0c56,0x0c56
+.short	0x0c56,0x0c56,0x0c56,0x0c56,0x0c56,0x0c56,0x0c56,0x0c56
+.short	0xae56,0xae56,0xae56,0xae56,0xae56,0xae56,0xae56,0xae56
+.short	0xae56,0xae56,0xae56,0xae56,0xae56,0xae56,0xae56,0xae56
+.short	0x026e,0x026e,0x026e,0x026e,0x026e,0x026e,0x026e,0x026e
+.short	0x026e,0x026e,0x026e,0x026e,0x026e,0x026e,0x026e,0x026e
+.short	0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e
+.short	0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e,0x6c6e
+.short	0x0629,0x0629,0x0629,0x0629,0x0629,0x0629,0x0629,0x0629
+.short	0x0629,0x0629,0x0629,0x0629,0x0629,0x0629,0x0629,0x0629
+.short	0xf129,0xf129,0xf129,0xf129,0xf129,0xf129,0xf129,0xf129
+.short	0xf129,0xf129,0xf129,0xf129,0xf129,0xf129,0xf129,0xf129
+.short	0x00b6,0x00b6,0x00b6,0x00b6,0x00b6,0x00b6,0x00b6,0x00b6
+.short	0x00b6,0x00b6,0x00b6,0x00b6,0x00b6,0x00b6,0x00b6,0x00b6
+.short	0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6
+.short	0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6,0xc2b6
+.short	0x023d,0x023d,0x023d,0x023d,0x023d,0x023d,0x023d,0x023d
+.short	0x07d4,0x07d4,0x07d4,0x07d4,0x07d4,0x07d4,0x07d4,0x07d4
+.short	0xe93d,0xe93d,0xe93d,0xe93d,0xe93d,0xe93d,0xe93d,0xe93d
+.short	0x43d4,0x43d4,0x43d4,0x43d4,0x43d4,0x43d4,0x43d4,0x43d4
+.short	0x0108,0x0108,0x0108,0x0108,0x0108,0x0108,0x0108,0x0108
+.short	0x017f,0x017f,0x017f,0x017f,0x017f,0x017f,0x017f,0x017f
+.short	0x9908,0x9908,0x9908,0x9908,0x9908,0x9908,0x9908,0x9908
+.short	0x8e7f,0x8e7f,0x8e7f,0x8e7f,0x8e7f,0x8e7f,0x8e7f,0x8e7f
+.short	0x04c7,0x04c7,0x04c7,0x04c7,0x028c,0x028c,0x028c,0x028c
+.short	0x0ad9,0x0ad9,0x0ad9,0x0ad9,0x03f7,0x03f7,0x03f7,0x03f7
+.short	0xe9c7,0xe9c7,0xe9c7,0xe9c7,0xe68c,0xe68c,0xe68c,0xe68c
+.short	0x05d9,0x05d9,0x05d9,0x05d9,0x78f7,0x78f7,0x78f7,0x78f7
+.short	0x07f4,0x07f4,0x07f4,0x07f4,0x05d3,0x05d3,0x05d3,0x05d3
+.short	0x0be7,0x0be7,0x0be7,0x0be7,0x06f9,0x06f9,0x06f9,0x06f9
+.short	0xa3f4,0xa3f4,0xa3f4,0xa3f4,0x4ed3,0x4ed3,0x4ed3,0x4ed3
+.short	0x50e7,0x50e7,0x50e7,0x50e7,0x61f9,0x61f9,0x61f9,0x61f9
+.short	0x09c4,0x09c4,0x09c4,0x09c4,0x09c4,0x09c4,0x09c4,0x09c4
+.short	0x05b2,0x05b2,0x05b2,0x05b2,0x05b2,0x05b2,0x05b2,0x05b2
+.short	0x15c4,0x15c4,0x15c4,0x15c4,0x15c4,0x15c4,0x15c4,0x15c4
+.short	0xfbb2,0xfbb2,0xfbb2,0xfbb2,0xfbb2,0xfbb2,0xfbb2,0xfbb2
+.short	0x06bf,0x06bf,0x06bf,0x06bf,0x06bf,0x06bf,0x06bf,0x06bf
+.short	0x0c7f,0x0c7f,0x0c7f,0x0c7f,0x0c7f,0x0c7f,0x0c7f,0x0c7f
+.short	0x53bf,0x53bf,0x53bf,0x53bf,0x53bf,0x53bf,0x53bf,0x53bf
+.short	0x997f,0x997f,0x997f,0x997f,0x997f,0x997f,0x997f,0x997f
+.short	0x0204,0x0204,0x0204,0x0204,0x0cf9,0x0cf9,0x0cf9,0x0cf9
+.short	0x0bc1,0x0bc1,0x0bc1,0x0bc1,0x0a67,0x0a67,0x0a67,0x0a67
+.short	0xce04,0xce04,0xce04,0xce04,0x67f9,0x67f9,0x67f9,0x67f9
+.short	0x3ec1,0x3ec1,0x3ec1,0x3ec1,0xcf67,0xcf67,0xcf67,0xcf67
+.short	0x06af,0x06af,0x06af,0x06af,0x0877,0x0877,0x0877,0x0877
+.short	0x007e,0x007e,0x007e,0x007e,0x05bd,0x05bd,0x05bd,0x05bd
+.short	0x23af,0x23af,0x23af,0x23af,0xfd77,0xfd77,0xfd77,0xfd77
+.short	0x9a7e,0x9a7e,0x9a7e,0x9a7e,0x6cbd,0x6cbd,0x6cbd,0x6cbd
+.short	0x08b2,0x08b2,0x01ae,0x01ae,0x022b,0x022b,0x034b,0x034b
+.short	0x081e,0x081e,0x0367,0x0367,0x060e,0x060e,0x0069,0x0069
+.short	0xfeb2,0xfeb2,0x2bae,0x2bae,0xd32b,0xd32b,0x344b,0x344b
+.short	0x821e,0x821e,0xc867,0xc867,0x500e,0x500e,0xab69,0xab69
+.short	0x01a6,0x01a6,0x024b,0x024b,0x00b1,0x00b1,0x0c16,0x0c16
+.short	0x0bde,0x0bde,0x0b35,0x0b35,0x0626,0x0626,0x0675,0x0675
+.short	0x93a6,0x93a6,0x334b,0x334b,0x03b1,0x03b1,0xee16,0xee16
+.short	0xc5de,0xc5de,0x5a35,0x5a35,0x1826,0x1826,0x1575,0x1575
+.short	0x0c0b,0x0c0b,0x030a,0x030a,0x0487,0x0487,0x0c6e,0x0c6e
+.short	0x09f8,0x09f8,0x05cb,0x05cb,0x0aa7,0x0aa7,0x045f,0x045f
+.short	0x7d0b,0x7d0b,0x810a,0x810a,0x2987,0x2987,0x766e,0x766e
+.short	0x71f8,0x71f8,0xb6cb,0xb6cb,0x8fa7,0x8fa7,0x315f,0x315f
+.short	0x06cb,0x06cb,0x0284,0x0284,0x0999,0x0999,0x015d,0x015d
+.short	0x01a2,0x01a2,0x0149,0x0149,0x0c65,0x0c65,0x0cb6,0x0cb6
+.short	0xb7cb,0xb7cb,0x4e84,0x4e84,0x4499,0x4499,0x485d,0x485d
+.short	0xc7a2,0xc7a2,0x4c49,0x4c49,0xeb65,0xeb65,0xceb6,0xceb6
+.short	0x0714,0x0714,0x0714,0x0714,0x0714,0x0714,0x0714,0x0714
+.short	0x0714,0x0714,0x0714,0x0714,0x0714,0x0714,0x0714,0x0714
+.short	0x0314,0x0314,0x0314,0x0314,0x0314,0x0314,0x0314,0x0314
+.short	0x0314,0x0314,0x0314,0x0314,0x0314,0x0314,0x0314,0x0314
+.short	0x011f,0x011f,0x011f,0x011f,0x011f,0x011f,0x011f,0x011f
+.short	0x011f,0x011f,0x011f,0x011f,0x011f,0x011f,0x011f,0x011f
+.short	0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f
+.short	0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f,0x6e1f
+.short	0x00ca,0x00ca,0x00ca,0x00ca,0x00ca,0x00ca,0x00ca,0x00ca
+.short	0x00ca,0x00ca,0x00ca,0x00ca,0x00ca,0x00ca,0x00ca,0x00ca
+.short	0xbeca,0xbeca,0xbeca,0xbeca,0xbeca,0xbeca,0xbeca,0xbeca
+.short	0xbeca,0xbeca,0xbeca,0xbeca,0xbeca,0xbeca,0xbeca,0xbeca
+.short	0x03c2,0x03c2,0x03c2,0x03c2,0x03c2,0x03c2,0x03c2,0x03c2
+.short	0x03c2,0x03c2,0x03c2,0x03c2,0x03c2,0x03c2,0x03c2,0x03c2
+.short	0x29c2,0x29c2,0x29c2,0x29c2,0x29c2,0x29c2,0x29c2,0x29c2
+.short	0x29c2,0x29c2,0x29c2,0x29c2,0x29c2,0x29c2,0x29c2,0x29c2
+.short	0x084f,0x084f,0x084f,0x084f,0x084f,0x084f,0x084f,0x084f
+.short	0x084f,0x084f,0x084f,0x084f,0x084f,0x084f,0x084f,0x084f
+.short	0x054f,0x054f,0x054f,0x054f,0x054f,0x054f,0x054f,0x054f
+.short	0x054f,0x054f,0x054f,0x054f,0x054f,0x054f,0x054f,0x054f
+.short	0x073f,0x073f,0x073f,0x073f,0x073f,0x073f,0x073f,0x073f
+.short	0x073f,0x073f,0x073f,0x073f,0x073f,0x073f,0x073f,0x073f
+.short	0xd43f,0xd43f,0xd43f,0xd43f,0xd43f,0xd43f,0xd43f,0xd43f
+.short	0xd43f,0xd43f,0xd43f,0xd43f,0xd43f,0xd43f,0xd43f,0xd43f
+.short	0x05bc,0x05bc,0x05bc,0x05bc,0x05bc,0x05bc,0x05bc,0x05bc
+.short	0x05bc,0x05bc,0x05bc,0x05bc,0x05bc,0x05bc,0x05bc,0x05bc
+.short	0x79bc,0x79bc,0x79bc,0x79bc,0x79bc,0x79bc,0x79bc,0x79bc
+.short	0x79bc,0x79bc,0x79bc,0x79bc,0x79bc,0x79bc,0x79bc,0x79bc
+.short	0x0a58,0x0a58,0x0a58,0x0a58,0x0a58,0x0a58,0x0a58,0x0a58
+.short	0x03f9,0x03f9,0x03f9,0x03f9,0x03f9,0x03f9,0x03f9,0x03f9
+.short	0x9258,0x9258,0x9258,0x9258,0x9258,0x9258,0x9258,0x9258
+.short	0x5ef9,0x5ef9,0x5ef9,0x5ef9,0x5ef9,0x5ef9,0x5ef9,0x5ef9
+.short	0x02dc,0x02dc,0x02dc,0x02dc,0x02dc,0x02dc,0x02dc,0x02dc
+.short	0x0260,0x0260,0x0260,0x0260,0x0260,0x0260,0x0260,0x0260
+.short	0xd6dc,0xd6dc,0xd6dc,0xd6dc,0xd6dc,0xd6dc,0xd6dc,0xd6dc
+.short	0x2260,0x2260,0x2260,0x2260,0x2260,0x2260,0x2260,0x2260
+.short	0x09ac,0x09ac,0x09ac,0x09ac,0x0ca7,0x0ca7,0x0ca7,0x0ca7
+.short	0x0bf2,0x0bf2,0x0bf2,0x0bf2,0x033e,0x033e,0x033e,0x033e
+.short	0x4dac,0x4dac,0x4dac,0x4dac,0x91a7,0x91a7,0x91a7,0x91a7
+.short	0xc1f2,0xc1f2,0xc1f2,0xc1f2,0xdd3e,0xdd3e,0xdd3e,0xdd3e
+.short	0x006b,0x006b,0x006b,0x006b,0x0774,0x0774,0x0774,0x0774
+.short	0x0c0a,0x0c0a,0x0c0a,0x0c0a,0x094a,0x094a,0x094a,0x094a
+.short	0x916b,0x916b,0x916b,0x916b,0x2374,0x2374,0x2374,0x2374
+.short	0x8a0a,0x8a0a,0x8a0a,0x8a0a,0x474a,0x474a,0x474a,0x474a
+.short	0x06fb,0x06fb,0x06fb,0x06fb,0x06fb,0x06fb,0x06fb,0x06fb
+.short	0x019b,0x019b,0x019b,0x019b,0x019b,0x019b,0x019b,0x019b
+.short	0x47fb,0x47fb,0x47fb,0x47fb,0x47fb,0x47fb,0x47fb,0x47fb
+.short	0x229b,0x229b,0x229b,0x229b,0x229b,0x229b,0x229b,0x229b
+.short	0x0c34,0x0c34,0x0c34,0x0c34,0x0c34,0x0c34,0x0c34,0x0c34
+.short	0x06de,0x06de,0x06de,0x06de,0x06de,0x06de,0x06de,0x06de
+.short	0x6834,0x6834,0x6834,0x6834,0x6834,0x6834,0x6834,0x6834
+.short	0xc0de,0xc0de,0xc0de,0xc0de,0xc0de,0xc0de,0xc0de,0xc0de
+.short	0x0b73,0x0b73,0x0b73,0x0b73,0x03c1,0x03c1,0x03c1,0x03c1
+.short	0x071d,0x071d,0x071d,0x071d,0x0a2c,0x0a2c,0x0a2c,0x0a2c
+.short	0x3473,0x3473,0x3473,0x3473,0x36c1,0x36c1,0x36c1,0x36c1
+.short	0x8e1d,0x8e1d,0x8e1d,0x8e1d,0xce2c,0xce2c,0xce2c,0xce2c
+.short	0x01c0,0x01c0,0x01c0,0x01c0,0x08d8,0x08d8,0x08d8,0x08d8
+.short	0x02a5,0x02a5,0x02a5,0x02a5,0x0806,0x0806,0x0806,0x0806
+.short	0x41c0,0x41c0,0x41c0,0x41c0,0x10d8,0x10d8,0x10d8,0x10d8
+.short	0xa1a5,0xa1a5,0xa1a5,0xa1a5,0xba06,0xba06,0xba06,0xba06
+.short	0x0331,0x0331,0x0449,0x0449,0x025b,0x025b,0x0262,0x0262
+.short	0x052a,0x052a,0x07fc,0x07fc,0x0748,0x0748,0x0180,0x0180
+.short	0x8631,0x8631,0x4f49,0x4f49,0x635b,0x635b,0x0862,0x0862
+.short	0xe32a,0xe32a,0x3bfc,0x3bfc,0x5f48,0x5f48,0x8180,0x8180
+.short	0x0842,0x0842,0x0c79,0x0c79,0x04c2,0x04c2,0x07ca,0x07ca
+.short	0x0997,0x0997,0x00dc,0x00dc,0x085e,0x085e,0x0686,0x0686
+.short	0xae42,0xae42,0xe779,0xe779,0x2ac2,0x2ac2,0xc5ca,0xc5ca
+.short	0x5e97,0x5e97,0xd4dc,0xd4dc,0x425e,0x425e,0x3886,0x3886
+.short	0x0860,0x0860,0x0707,0x0707,0x0803,0x0803,0x031a,0x031a
+.short	0x071b,0x071b,0x09ab,0x09ab,0x099b,0x099b,0x01de,0x01de
+.short	0x2860,0x2860,0xac07,0xac07,0xe103,0xe103,0xb11a,0xb11a
+.short	0xa81b,0xa81b,0x5aab,0x5aab,0x2a9b,0x2a9b,0xbbde,0xbbde
+.short	0x0c95,0x0c95,0x0bcd,0x0bcd,0x03e4,0x03e4,0x03df,0x03df
+.short	0x03be,0x03be,0x074d,0x074d,0x05f2,0x05f2,0x065c,0x065c
+.short	0x7b95,0x7b95,0xa2cd,0xa2cd,0x6fe4,0x6fe4,0xb0df,0xb0df
+.short	0x5dbe,0x5dbe,0x1e4d,0x1e4d,0xbbf2,0xbbf2,0x5a5c,0x5a5c
 #ifndef __APPLE__
 .data
 #else
@@ -790,134 +292,38 @@ L_mlkem_avx2_zetas:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_avx2_zetas_basemul:
-.value	0x08b2,0x081e
-.value	0xf74e,0xf7e2
-.value	0x01ae,0x0367
-.value	0xfe52,0xfc99
-.value	0x022b,0x060e
-.value	0xfdd5,0xf9f2
-.value	0x034b,0x0069
-.value	0xfcb5,0xff97
-.value	0xfeb2,0x821e
-.value	0x014e,0x7de2
-.value	0x2bae,0xc867
-.value	0xd452,0x3799
-.value	0xd32b,0x500e
-.value	0x2cd5,0xaff2
-.value	0x344b,0xab69
-.value	0xcbb5,0x5497
-.value	0x01a6,0x0bde
-.value	0xfe5a,0xf422
-.value	0x024b,0x0b35
-.value	0xfdb5,0xf4cb
-.value	0x00b1,0x0626
-.value	0xff4f,0xf9da
-.value	0x0c16,0x0675
-.value	0xf3ea,0xf98b
-.value	0x93a6,0xc5de
-.value	0x6c5a,0x3a22
-.value	0x334b,0x5a35
-.value	0xccb5,0xa5cb
-.value	0x03b1,0x1826
-.value	0xfc4f,0xe7da
-.value	0xee16,0x1575
-.value	0x11ea,0xea8b
-.value	0x0c0b,0x09f8
-.value	0xf3f5,0xf608
-.value	0x030a,0x05cb
-.value	0xfcf6,0xfa35
-.value	0x0487,0x0aa7
-.value	0xfb79,0xf559
-.value	0x0c6e,0x045f
-.value	0xf392,0xfba1
-.value	0x7d0b,0x71f8
-.value	0x82f5,0x8e08
-.value	0x810a,0xb6cb
-.value	0x7ef6,0x4935
-.value	0x2987,0x8fa7
-.value	0xd679,0x7059
-.value	0x766e,0x315f
-.value	0x8992,0xcea1
-.value	0x06cb,0x01a2
-.value	0xf935,0xfe5e
-.value	0x0284,0x0149
-.value	0xfd7c,0xfeb7
-.value	0x0999,0x0c65
-.value	0xf667,0xf39b
-.value	0x015d,0x0cb6
-.value	0xfea3,0xf34a
-.value	0xb7cb,0xc7a2
-.value	0x4835,0x385e
-.value	0x4e84,0x4c49
-.value	0xb17c,0xb3b7
-.value	0x4499,0xeb65
-.value	0xbb67,0x149b
-.value	0x485d,0xceb6
-.value	0xb7a3,0x314a
-.value	0x0331,0x052a
-.value	0xfccf,0xfad6
-.value	0x0449,0x07fc
-.value	0xfbb7,0xf804
-.value	0x025b,0x0748
-.value	0xfda5,0xf8b8
-.value	0x0262,0x0180
-.value	0xfd9e,0xfe80
-.value	0x8631,0xe32a
-.value	0x79cf,0x1cd6
-.value	0x4f49,0x3bfc
-.value	0xb0b7,0xc404
-.value	0x635b,0x5f48
-.value	0x9ca5,0xa0b8
-.value	0x0862,0x8180
-.value	0xf79e,0x7e80
-.value	0x0842,0x0997
-.value	0xf7be,0xf669
-.value	0x0c79,0x00dc
-.value	0xf387,0xff24
-.value	0x04c2,0x085e
-.value	0xfb3e,0xf7a2
-.value	0x07ca,0x0686
-.value	0xf836,0xf97a
-.value	0xae42,0x5e97
-.value	0x51be,0xa169
-.value	0xe779,0xd4dc
-.value	0x1887,0x2b24
-.value	0x2ac2,0x425e
-.value	0xd53e,0xbda2
-.value	0xc5ca,0x3886
-.value	0x3a36,0xc77a
-.value	0x0860,0x071b
-.value	0xf7a0,0xf8e5
-.value	0x0707,0x09ab
-.value	0xf8f9,0xf655
-.value	0x0803,0x099b
-.value	0xf7fd,0xf665
-.value	0x031a,0x01de
-.value	0xfce6,0xfe22
-.value	0x2860,0xa81b
-.value	0xd7a0,0x57e5
-.value	0xac07,0x5aab
-.value	0x53f9,0xa555
-.value	0xe103,0x2a9b
-.value	0x1efd,0xd565
-.value	0xb11a,0xbbde
-.value	0x4ee6,0x4422
-.value	0x0c95,0x03be
-.value	0xf36b,0xfc42
-.value	0x0bcd,0x074d
-.value	0xf433,0xf8b3
-.value	0x03e4,0x05f2
-.value	0xfc1c,0xfa0e
-.value	0x03df,0x065c
-.value	0xfc21,0xf9a4
-.value	0x7b95,0x5dbe
-.value	0x846b,0xa242
-.value	0xa2cd,0x1e4d
-.value	0x5d33,0xe1b3
-.value	0x6fe4,0xbbf2
-.value	0x901c,0x440e
-.value	0xb0df,0x5a5c
-.value	0x4f21,0xa5a4
+.short	0x08b2,0x081e,0xf74e,0xf7e2,0x01ae,0x0367,0xfe52,0xfc99
+.short	0x022b,0x060e,0xfdd5,0xf9f2,0x034b,0x0069,0xfcb5,0xff97
+.short	0xfeb2,0x821e,0x014e,0x7de2,0x2bae,0xc867,0xd452,0x3799
+.short	0xd32b,0x500e,0x2cd5,0xaff2,0x344b,0xab69,0xcbb5,0x5497
+.short	0x01a6,0x0bde,0xfe5a,0xf422,0x024b,0x0b35,0xfdb5,0xf4cb
+.short	0x00b1,0x0626,0xff4f,0xf9da,0x0c16,0x0675,0xf3ea,0xf98b
+.short	0x93a6,0xc5de,0x6c5a,0x3a22,0x334b,0x5a35,0xccb5,0xa5cb
+.short	0x03b1,0x1826,0xfc4f,0xe7da,0xee16,0x1575,0x11ea,0xea8b
+.short	0x0c0b,0x09f8,0xf3f5,0xf608,0x030a,0x05cb,0xfcf6,0xfa35
+.short	0x0487,0x0aa7,0xfb79,0xf559,0x0c6e,0x045f,0xf392,0xfba1
+.short	0x7d0b,0x71f8,0x82f5,0x8e08,0x810a,0xb6cb,0x7ef6,0x4935
+.short	0x2987,0x8fa7,0xd679,0x7059,0x766e,0x315f,0x8992,0xcea1
+.short	0x06cb,0x01a2,0xf935,0xfe5e,0x0284,0x0149,0xfd7c,0xfeb7
+.short	0x0999,0x0c65,0xf667,0xf39b,0x015d,0x0cb6,0xfea3,0xf34a
+.short	0xb7cb,0xc7a2,0x4835,0x385e,0x4e84,0x4c49,0xb17c,0xb3b7
+.short	0x4499,0xeb65,0xbb67,0x149b,0x485d,0xceb6,0xb7a3,0x314a
+.short	0x0331,0x052a,0xfccf,0xfad6,0x0449,0x07fc,0xfbb7,0xf804
+.short	0x025b,0x0748,0xfda5,0xf8b8,0x0262,0x0180,0xfd9e,0xfe80
+.short	0x8631,0xe32a,0x79cf,0x1cd6,0x4f49,0x3bfc,0xb0b7,0xc404
+.short	0x635b,0x5f48,0x9ca5,0xa0b8,0x0862,0x8180,0xf79e,0x7e80
+.short	0x0842,0x0997,0xf7be,0xf669,0x0c79,0x00dc,0xf387,0xff24
+.short	0x04c2,0x085e,0xfb3e,0xf7a2,0x07ca,0x0686,0xf836,0xf97a
+.short	0xae42,0x5e97,0x51be,0xa169,0xe779,0xd4dc,0x1887,0x2b24
+.short	0x2ac2,0x425e,0xd53e,0xbda2,0xc5ca,0x3886,0x3a36,0xc77a
+.short	0x0860,0x071b,0xf7a0,0xf8e5,0x0707,0x09ab,0xf8f9,0xf655
+.short	0x0803,0x099b,0xf7fd,0xf665,0x031a,0x01de,0xfce6,0xfe22
+.short	0x2860,0xa81b,0xd7a0,0x57e5,0xac07,0x5aab,0x53f9,0xa555
+.short	0xe103,0x2a9b,0x1efd,0xd565,0xb11a,0xbbde,0x4ee6,0x4422
+.short	0x0c95,0x03be,0xf36b,0xfc42,0x0bcd,0x074d,0xf433,0xf8b3
+.short	0x03e4,0x05f2,0xfc1c,0xfa0e,0x03df,0x065c,0xfc21,0xf9a4
+.short	0x7b95,0x5dbe,0x846b,0xa242,0xa2cd,0x1e4d,0x5d33,0xe1b3
+.short	0x6fe4,0xbbf2,0x901c,0x440e,0xb0df,0x5a5c,0x4f21,0xa5a4
 #ifndef __APPLE__
 .data
 #else
@@ -929,646 +335,166 @@ L_mlkem_avx2_zetas_basemul:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_avx2_zetas_inv:
-.value	0x06a5,0x06a5
-.value	0x05b4,0x05b4
-.value	0x070f,0x070f
-.value	0x0943,0x0943
-.value	0x0922,0x0922
-.value	0x0134,0x0134
-.value	0x091d,0x091d
-.value	0x006c,0x006c
-.value	0xa5a5,0xa5a5
-.value	0xe1b4,0xe1b4
-.value	0x440f,0x440f
-.value	0xa243,0xa243
-.value	0x4f22,0x4f22
-.value	0x5d34,0x5d34
-.value	0x901d,0x901d
-.value	0x846c,0x846c
-.value	0x0b23,0x0b23
-.value	0x0356,0x0356
-.value	0x0366,0x0366
-.value	0x05e6,0x05e6
-.value	0x09e7,0x09e7
-.value	0x05fa,0x05fa
-.value	0x04fe,0x04fe
-.value	0x04a1,0x04a1
-.value	0x4423,0x4423
-.value	0xa556,0xa556
-.value	0xd566,0xd566
-.value	0x57e6,0x57e6
-.value	0x4ee7,0x4ee7
-.value	0x53fa,0x53fa
-.value	0x1efe,0x1efe
-.value	0xd7a1,0xd7a1
-.value	0x04fb,0x04fb
-.value	0x04fb,0x04fb
-.value	0x0a5c,0x0a5c
-.value	0x0a5c,0x0a5c
-.value	0x0429,0x0429
-.value	0x0429,0x0429
-.value	0x0b41,0x0b41
-.value	0x0b41,0x0b41
-.value	0x45fb,0x45fb
-.value	0x45fb,0x45fb
-.value	0x5e5c,0x5e5c
-.value	0x5e5c,0x5e5c
-.value	0xef29,0xef29
-.value	0xef29,0xef29
-.value	0xbe41,0xbe41
-.value	0xbe41,0xbe41
-.value	0x02d5,0x02d5
-.value	0x02d5,0x02d5
-.value	0x05e4,0x05e4
-.value	0x05e4,0x05e4
-.value	0x0940,0x0940
-.value	0x0940,0x0940
-.value	0x018e,0x018e
-.value	0x018e,0x018e
-.value	0x31d5,0x31d5
-.value	0x31d5,0x31d5
-.value	0x71e4,0x71e4
-.value	0x71e4,0x71e4
-.value	0xc940,0xc940
-.value	0xc940,0xc940
-.value	0xcb8e,0xcb8e
-.value	0xcb8e,0xcb8e
-.value	0x0623,0x0623
-.value	0x0623,0x0623
-.value	0x0623,0x0623
-.value	0x0623,0x0623
-.value	0x00cd,0x00cd
-.value	0x00cd,0x00cd
-.value	0x00cd,0x00cd
-.value	0x00cd,0x00cd
-.value	0x3f23,0x3f23
-.value	0x3f23,0x3f23
-.value	0x3f23,0x3f23
-.value	0x3f23,0x3f23
-.value	0x97cd,0x97cd
-.value	0x97cd,0x97cd
-.value	0x97cd,0x97cd
-.value	0x97cd,0x97cd
-.value	0x0b66,0x0b66
-.value	0x0b66,0x0b66
-.value	0x0b66,0x0b66
-.value	0x0b66,0x0b66
-.value	0x0606,0x0606
-.value	0x0606,0x0606
-.value	0x0606,0x0606
-.value	0x0606,0x0606
-.value	0xdd66,0xdd66
-.value	0xdd66,0xdd66
-.value	0xdd66,0xdd66
-.value	0xdd66,0xdd66
-.value	0xb806,0xb806
-.value	0xb806,0xb806
-.value	0xb806,0xb806
-.value	0xb806,0xb806
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x0745,0x0745
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x8645,0x8645
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x05c2,0x05c2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x2bc2,0x2bc2
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x0c37,0x0c37
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x4137,0x4137
-.value	0x067b,0x067b
-.value	0x0c25,0x0c25
-.value	0x04a3,0x04a3
-.value	0x036a,0x036a
-.value	0x0537,0x0537
-.value	0x0088,0x0088
-.value	0x083f,0x083f
-.value	0x04bf,0x04bf
-.value	0xc77b,0xc77b
-.value	0x2b25,0x2b25
-.value	0xbda3,0xbda3
-.value	0xa16a,0xa16a
-.value	0x3a37,0x3a37
-.value	0x1888,0x1888
-.value	0xd53f,0xd53f
-.value	0x51bf,0x51bf
-.value	0x0b81,0x0b81
-.value	0x0505,0x0505
-.value	0x05b9,0x05b9
-.value	0x07d7,0x07d7
-.value	0x0a9f,0x0a9f
-.value	0x08b8,0x08b8
-.value	0x0aa6,0x0aa6
-.value	0x09d0,0x09d0
-.value	0x7e81,0x7e81
-.value	0xc405,0xc405
-.value	0xa0b9,0xa0b9
-.value	0x1cd7,0x1cd7
-.value	0xf79f,0xf79f
-.value	0xb0b8,0xb0b8
-.value	0x9ca6,0x9ca6
-.value	0x79d0,0x79d0
-.value	0x03b7,0x03b7
-.value	0x03b7,0x03b7
-.value	0x00f7,0x00f7
-.value	0x00f7,0x00f7
-.value	0x058d,0x058d
-.value	0x058d,0x058d
-.value	0x0c96,0x0c96
-.value	0x0c96,0x0c96
-.value	0xb8b7,0xb8b7
-.value	0xb8b7,0xb8b7
-.value	0x75f7,0x75f7
-.value	0x75f7,0x75f7
-.value	0xdc8d,0xdc8d
-.value	0xdc8d,0xdc8d
-.value	0x6e96,0x6e96
-.value	0x6e96,0x6e96
-.value	0x09c3,0x09c3
-.value	0x09c3,0x09c3
-.value	0x010f,0x010f
-.value	0x010f,0x010f
-.value	0x005a,0x005a
-.value	0x005a,0x005a
-.value	0x0355,0x0355
-.value	0x0355,0x0355
-.value	0x22c3,0x22c3
-.value	0x22c3,0x22c3
-.value	0x3e0f,0x3e0f
-.value	0x3e0f,0x3e0f
-.value	0x6e5a,0x6e5a
-.value	0x6e5a,0x6e5a
-.value	0xb255,0xb255
-.value	0xb255,0xb255
-.value	0x0aa1,0x0aa1
-.value	0x0aa1,0x0aa1
-.value	0x0aa1,0x0aa1
-.value	0x0aa1,0x0aa1
-.value	0x0a25,0x0a25
-.value	0x0a25,0x0a25
-.value	0x0a25,0x0a25
-.value	0x0a25,0x0a25
-.value	0xdda1,0xdda1
-.value	0xdda1,0xdda1
-.value	0xdda1,0xdda1
-.value	0xdda1,0xdda1
-.value	0x2925,0x2925
-.value	0x2925,0x2925
-.value	0x2925,0x2925
-.value	0x2925,0x2925
-.value	0x0908,0x0908
-.value	0x0908,0x0908
-.value	0x0908,0x0908
-.value	0x0908,0x0908
-.value	0x02a9,0x02a9
-.value	0x02a9,0x02a9
-.value	0x02a9,0x02a9
-.value	0x02a9,0x02a9
-.value	0xa108,0xa108
-.value	0xa108,0xa108
-.value	0xa108,0xa108
-.value	0xa108,0xa108
-.value	0x6da9,0x6da9
-.value	0x6da9,0x6da9
-.value	0x6da9,0x6da9
-.value	0x6da9,0x6da9
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0x04b2,0x04b2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0xfab2,0xfab2
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0x093f,0x093f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0xd63f,0xd63f
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x0be2,0x0be2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x91e2,0x91e2
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0x05ed,0x05ed
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0xfced,0xfced
-.value	0x004b,0x004b
-.value	0x0bb8,0x0bb8
-.value	0x009c,0x009c
-.value	0x0b5f,0x0b5f
-.value	0x0ba4,0x0ba4
-.value	0x0a7d,0x0a7d
-.value	0x0368,0x0368
-.value	0x0636,0x0636
-.value	0x314b,0x314b
-.value	0xb3b8,0xb3b8
-.value	0x149c,0x149c
-.value	0x385f,0x385f
-.value	0xb7a4,0xb7a4
-.value	0xb17d,0xb17d
-.value	0xbb68,0xbb68
-.value	0x4836,0x4836
-.value	0x08a2,0x08a2
-.value	0x0736,0x0736
-.value	0x025a,0x025a
-.value	0x0309,0x0309
-.value	0x0093,0x0093
-.value	0x09f7,0x09f7
-.value	0x087a,0x087a
-.value	0x00f6,0x00f6
-.value	0xcea2,0xcea2
-.value	0x4936,0x4936
-.value	0x705a,0x705a
-.value	0x8e09,0x8e09
-.value	0x8993,0x8993
-.value	0x7ef7,0x7ef7
-.value	0xd67a,0xd67a
-.value	0x82f6,0x82f6
-.value	0x0744,0x0744
-.value	0x0744,0x0744
-.value	0x0c83,0x0c83
-.value	0x0c83,0x0c83
-.value	0x048a,0x048a
-.value	0x048a,0x048a
-.value	0x0652,0x0652
-.value	0x0652,0x0652
-.value	0x9344,0x9344
-.value	0x9344,0x9344
-.value	0x6583,0x6583
-.value	0x6583,0x6583
-.value	0x028a,0x028a
-.value	0x028a,0x028a
-.value	0xdc52,0xdc52
-.value	0xdc52,0xdc52
-.value	0x029a,0x029a
-.value	0x029a,0x029a
-.value	0x0140,0x0140
-.value	0x0140,0x0140
-.value	0x0008,0x0008
-.value	0x0008,0x0008
-.value	0x0afd,0x0afd
-.value	0x0afd,0x0afd
-.value	0x309a,0x309a
-.value	0x309a,0x309a
-.value	0xc140,0xc140
-.value	0xc140,0xc140
-.value	0x9808,0x9808
-.value	0x9808,0x9808
-.value	0x31fd,0x31fd
-.value	0x31fd,0x31fd
-.value	0x0082,0x0082
-.value	0x0082,0x0082
-.value	0x0082,0x0082
-.value	0x0082,0x0082
-.value	0x0642,0x0642
-.value	0x0642,0x0642
-.value	0x0642,0x0642
-.value	0x0642,0x0642
-.value	0x6682,0x6682
-.value	0x6682,0x6682
-.value	0x6682,0x6682
-.value	0x6682,0x6682
-.value	0xac42,0xac42
-.value	0xac42,0xac42
-.value	0xac42,0xac42
-.value	0xac42,0xac42
-.value	0x074f,0x074f
-.value	0x074f,0x074f
-.value	0x074f,0x074f
-.value	0x074f,0x074f
-.value	0x033d,0x033d
-.value	0x033d,0x033d
-.value	0x033d,0x033d
-.value	0x033d,0x033d
-.value	0x044f,0x044f
-.value	0x044f,0x044f
-.value	0x044f,0x044f
-.value	0x044f,0x044f
-.value	0xea3d,0xea3d
-.value	0xea3d,0xea3d
-.value	0xea3d,0xea3d
-.value	0xea3d,0xea3d
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x0c4b,0x0c4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x3d4b,0x3d4b
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x06d8,0x06d8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0ed8,0x0ed8
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x0773,0x0773
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x3073,0x3073
-.value	0x068c,0x068c
-.value	0x01cc,0x01cc
-.value	0x06db,0x06db
-.value	0x0123,0x0123
-.value	0x00eb,0x00eb
-.value	0x0ab6,0x0ab6
-.value	0x0c50,0x0c50
-.value	0x0b5b,0x0b5b
-.value	0xea8c,0xea8c
-.value	0xa5cc,0xa5cc
-.value	0xe7db,0xe7db
-.value	0x3a23,0x3a23
-.value	0x11eb,0x11eb
-.value	0xccb6,0xccb6
-.value	0xfc50,0xfc50
-.value	0x6c5b,0x6c5b
-.value	0x0c98,0x0c98
-.value	0x099a,0x099a
-.value	0x06f3,0x06f3
-.value	0x04e3,0x04e3
-.value	0x09b6,0x09b6
-.value	0x0b53,0x0b53
-.value	0x0ad6,0x0ad6
-.value	0x044f,0x044f
-.value	0x5498,0x5498
-.value	0x379a,0x379a
-.value	0xaff3,0xaff3
-.value	0x7de3,0x7de3
-.value	0xcbb6,0xcbb6
-.value	0xd453,0xd453
-.value	0x2cd6,0x2cd6
-.value	0x014f,0x014f
-.value	0x0608,0x0608
-.value	0x0608,0x0608
-.value	0x011a,0x011a
-.value	0x011a,0x011a
-.value	0x072e,0x072e
-.value	0x072e,0x072e
-.value	0x050d,0x050d
-.value	0x050d,0x050d
-.value	0x9e08,0x9e08
-.value	0x9e08,0x9e08
-.value	0xaf1a,0xaf1a
-.value	0xaf1a,0xaf1a
-.value	0xb12e,0xb12e
-.value	0xb12e,0xb12e
-.value	0x5c0d,0x5c0d
-.value	0x5c0d,0x5c0d
-.value	0x090a,0x090a
-.value	0x090a,0x090a
-.value	0x0228,0x0228
-.value	0x0228,0x0228
-.value	0x0a75,0x0a75
-.value	0x0a75,0x0a75
-.value	0x083a,0x083a
-.value	0x083a,0x083a
-.value	0x870a,0x870a
-.value	0x870a,0x870a
-.value	0xfa28,0xfa28
-.value	0xfa28,0xfa28
-.value	0x1975,0x1975
-.value	0x1975,0x1975
-.value	0x163a,0x163a
-.value	0x163a,0x163a
-.value	0x0b82,0x0b82
-.value	0x0b82,0x0b82
-.value	0x0b82,0x0b82
-.value	0x0b82,0x0b82
-.value	0x0bf9,0x0bf9
-.value	0x0bf9,0x0bf9
-.value	0x0bf9,0x0bf9
-.value	0x0bf9,0x0bf9
-.value	0x7182,0x7182
-.value	0x7182,0x7182
-.value	0x7182,0x7182
-.value	0x7182,0x7182
-.value	0x66f9,0x66f9
-.value	0x66f9,0x66f9
-.value	0x66f9,0x66f9
-.value	0x66f9,0x66f9
-.value	0x052d,0x052d
-.value	0x052d,0x052d
-.value	0x052d,0x052d
-.value	0x052d,0x052d
-.value	0x0ac4,0x0ac4
-.value	0x0ac4,0x0ac4
-.value	0x0ac4,0x0ac4
-.value	0x0ac4,0x0ac4
-.value	0xbc2d,0xbc2d
-.value	0xbc2d,0xbc2d
-.value	0xbc2d,0xbc2d
-.value	0xbc2d,0xbc2d
-.value	0x16c4,0x16c4
-.value	0x16c4,0x16c4
-.value	0x16c4,0x16c4
-.value	0x16c4,0x16c4
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x0a93,0x0a93
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x9393,0x9393
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x00ab,0x00ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x51ab,0x51ab
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0x072c,0x072c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0xcb2c,0xcb2c
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0x0167,0x0167
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0xc667,0xc667
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x02f6,0x02f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x84f6,0x84f6
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0x05a1,0x05a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
-.value	0xd8a1,0xd8a1
+.short	0x06a5,0x06a5,0x05b4,0x05b4,0x070f,0x070f,0x0943,0x0943
+.short	0x0922,0x0922,0x0134,0x0134,0x091d,0x091d,0x006c,0x006c
+.short	0xa5a5,0xa5a5,0xe1b4,0xe1b4,0x440f,0x440f,0xa243,0xa243
+.short	0x4f22,0x4f22,0x5d34,0x5d34,0x901d,0x901d,0x846c,0x846c
+.short	0x0b23,0x0b23,0x0356,0x0356,0x0366,0x0366,0x05e6,0x05e6
+.short	0x09e7,0x09e7,0x05fa,0x05fa,0x04fe,0x04fe,0x04a1,0x04a1
+.short	0x4423,0x4423,0xa556,0xa556,0xd566,0xd566,0x57e6,0x57e6
+.short	0x4ee7,0x4ee7,0x53fa,0x53fa,0x1efe,0x1efe,0xd7a1,0xd7a1
+.short	0x04fb,0x04fb,0x04fb,0x04fb,0x0a5c,0x0a5c,0x0a5c,0x0a5c
+.short	0x0429,0x0429,0x0429,0x0429,0x0b41,0x0b41,0x0b41,0x0b41
+.short	0x45fb,0x45fb,0x45fb,0x45fb,0x5e5c,0x5e5c,0x5e5c,0x5e5c
+.short	0xef29,0xef29,0xef29,0xef29,0xbe41,0xbe41,0xbe41,0xbe41
+.short	0x02d5,0x02d5,0x02d5,0x02d5,0x05e4,0x05e4,0x05e4,0x05e4
+.short	0x0940,0x0940,0x0940,0x0940,0x018e,0x018e,0x018e,0x018e
+.short	0x31d5,0x31d5,0x31d5,0x31d5,0x71e4,0x71e4,0x71e4,0x71e4
+.short	0xc940,0xc940,0xc940,0xc940,0xcb8e,0xcb8e,0xcb8e,0xcb8e
+.short	0x0623,0x0623,0x0623,0x0623,0x0623,0x0623,0x0623,0x0623
+.short	0x00cd,0x00cd,0x00cd,0x00cd,0x00cd,0x00cd,0x00cd,0x00cd
+.short	0x3f23,0x3f23,0x3f23,0x3f23,0x3f23,0x3f23,0x3f23,0x3f23
+.short	0x97cd,0x97cd,0x97cd,0x97cd,0x97cd,0x97cd,0x97cd,0x97cd
+.short	0x0b66,0x0b66,0x0b66,0x0b66,0x0b66,0x0b66,0x0b66,0x0b66
+.short	0x0606,0x0606,0x0606,0x0606,0x0606,0x0606,0x0606,0x0606
+.short	0xdd66,0xdd66,0xdd66,0xdd66,0xdd66,0xdd66,0xdd66,0xdd66
+.short	0xb806,0xb806,0xb806,0xb806,0xb806,0xb806,0xb806,0xb806
+.short	0x0745,0x0745,0x0745,0x0745,0x0745,0x0745,0x0745,0x0745
+.short	0x0745,0x0745,0x0745,0x0745,0x0745,0x0745,0x0745,0x0745
+.short	0x8645,0x8645,0x8645,0x8645,0x8645,0x8645,0x8645,0x8645
+.short	0x8645,0x8645,0x8645,0x8645,0x8645,0x8645,0x8645,0x8645
+.short	0x05c2,0x05c2,0x05c2,0x05c2,0x05c2,0x05c2,0x05c2,0x05c2
+.short	0x05c2,0x05c2,0x05c2,0x05c2,0x05c2,0x05c2,0x05c2,0x05c2
+.short	0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2
+.short	0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2,0x2bc2
+.short	0x0c37,0x0c37,0x0c37,0x0c37,0x0c37,0x0c37,0x0c37,0x0c37
+.short	0x0c37,0x0c37,0x0c37,0x0c37,0x0c37,0x0c37,0x0c37,0x0c37
+.short	0x4137,0x4137,0x4137,0x4137,0x4137,0x4137,0x4137,0x4137
+.short	0x4137,0x4137,0x4137,0x4137,0x4137,0x4137,0x4137,0x4137
+.short	0x067b,0x067b,0x0c25,0x0c25,0x04a3,0x04a3,0x036a,0x036a
+.short	0x0537,0x0537,0x0088,0x0088,0x083f,0x083f,0x04bf,0x04bf
+.short	0xc77b,0xc77b,0x2b25,0x2b25,0xbda3,0xbda3,0xa16a,0xa16a
+.short	0x3a37,0x3a37,0x1888,0x1888,0xd53f,0xd53f,0x51bf,0x51bf
+.short	0x0b81,0x0b81,0x0505,0x0505,0x05b9,0x05b9,0x07d7,0x07d7
+.short	0x0a9f,0x0a9f,0x08b8,0x08b8,0x0aa6,0x0aa6,0x09d0,0x09d0
+.short	0x7e81,0x7e81,0xc405,0xc405,0xa0b9,0xa0b9,0x1cd7,0x1cd7
+.short	0xf79f,0xf79f,0xb0b8,0xb0b8,0x9ca6,0x9ca6,0x79d0,0x79d0
+.short	0x03b7,0x03b7,0x03b7,0x03b7,0x00f7,0x00f7,0x00f7,0x00f7
+.short	0x058d,0x058d,0x058d,0x058d,0x0c96,0x0c96,0x0c96,0x0c96
+.short	0xb8b7,0xb8b7,0xb8b7,0xb8b7,0x75f7,0x75f7,0x75f7,0x75f7
+.short	0xdc8d,0xdc8d,0xdc8d,0xdc8d,0x6e96,0x6e96,0x6e96,0x6e96
+.short	0x09c3,0x09c3,0x09c3,0x09c3,0x010f,0x010f,0x010f,0x010f
+.short	0x005a,0x005a,0x005a,0x005a,0x0355,0x0355,0x0355,0x0355
+.short	0x22c3,0x22c3,0x22c3,0x22c3,0x3e0f,0x3e0f,0x3e0f,0x3e0f
+.short	0x6e5a,0x6e5a,0x6e5a,0x6e5a,0xb255,0xb255,0xb255,0xb255
+.short	0x0aa1,0x0aa1,0x0aa1,0x0aa1,0x0aa1,0x0aa1,0x0aa1,0x0aa1
+.short	0x0a25,0x0a25,0x0a25,0x0a25,0x0a25,0x0a25,0x0a25,0x0a25
+.short	0xdda1,0xdda1,0xdda1,0xdda1,0xdda1,0xdda1,0xdda1,0xdda1
+.short	0x2925,0x2925,0x2925,0x2925,0x2925,0x2925,0x2925,0x2925
+.short	0x0908,0x0908,0x0908,0x0908,0x0908,0x0908,0x0908,0x0908
+.short	0x02a9,0x02a9,0x02a9,0x02a9,0x02a9,0x02a9,0x02a9,0x02a9
+.short	0xa108,0xa108,0xa108,0xa108,0xa108,0xa108,0xa108,0xa108
+.short	0x6da9,0x6da9,0x6da9,0x6da9,0x6da9,0x6da9,0x6da9,0x6da9
+.short	0x04b2,0x04b2,0x04b2,0x04b2,0x04b2,0x04b2,0x04b2,0x04b2
+.short	0x04b2,0x04b2,0x04b2,0x04b2,0x04b2,0x04b2,0x04b2,0x04b2
+.short	0xfab2,0xfab2,0xfab2,0xfab2,0xfab2,0xfab2,0xfab2,0xfab2
+.short	0xfab2,0xfab2,0xfab2,0xfab2,0xfab2,0xfab2,0xfab2,0xfab2
+.short	0x093f,0x093f,0x093f,0x093f,0x093f,0x093f,0x093f,0x093f
+.short	0x093f,0x093f,0x093f,0x093f,0x093f,0x093f,0x093f,0x093f
+.short	0xd63f,0xd63f,0xd63f,0xd63f,0xd63f,0xd63f,0xd63f,0xd63f
+.short	0xd63f,0xd63f,0xd63f,0xd63f,0xd63f,0xd63f,0xd63f,0xd63f
+.short	0x0be2,0x0be2,0x0be2,0x0be2,0x0be2,0x0be2,0x0be2,0x0be2
+.short	0x0be2,0x0be2,0x0be2,0x0be2,0x0be2,0x0be2,0x0be2,0x0be2
+.short	0x91e2,0x91e2,0x91e2,0x91e2,0x91e2,0x91e2,0x91e2,0x91e2
+.short	0x91e2,0x91e2,0x91e2,0x91e2,0x91e2,0x91e2,0x91e2,0x91e2
+.short	0x05ed,0x05ed,0x05ed,0x05ed,0x05ed,0x05ed,0x05ed,0x05ed
+.short	0x05ed,0x05ed,0x05ed,0x05ed,0x05ed,0x05ed,0x05ed,0x05ed
+.short	0xfced,0xfced,0xfced,0xfced,0xfced,0xfced,0xfced,0xfced
+.short	0xfced,0xfced,0xfced,0xfced,0xfced,0xfced,0xfced,0xfced
+.short	0x004b,0x004b,0x0bb8,0x0bb8,0x009c,0x009c,0x0b5f,0x0b5f
+.short	0x0ba4,0x0ba4,0x0a7d,0x0a7d,0x0368,0x0368,0x0636,0x0636
+.short	0x314b,0x314b,0xb3b8,0xb3b8,0x149c,0x149c,0x385f,0x385f
+.short	0xb7a4,0xb7a4,0xb17d,0xb17d,0xbb68,0xbb68,0x4836,0x4836
+.short	0x08a2,0x08a2,0x0736,0x0736,0x025a,0x025a,0x0309,0x0309
+.short	0x0093,0x0093,0x09f7,0x09f7,0x087a,0x087a,0x00f6,0x00f6
+.short	0xcea2,0xcea2,0x4936,0x4936,0x705a,0x705a,0x8e09,0x8e09
+.short	0x8993,0x8993,0x7ef7,0x7ef7,0xd67a,0xd67a,0x82f6,0x82f6
+.short	0x0744,0x0744,0x0744,0x0744,0x0c83,0x0c83,0x0c83,0x0c83
+.short	0x048a,0x048a,0x048a,0x048a,0x0652,0x0652,0x0652,0x0652
+.short	0x9344,0x9344,0x9344,0x9344,0x6583,0x6583,0x6583,0x6583
+.short	0x028a,0x028a,0x028a,0x028a,0xdc52,0xdc52,0xdc52,0xdc52
+.short	0x029a,0x029a,0x029a,0x029a,0x0140,0x0140,0x0140,0x0140
+.short	0x0008,0x0008,0x0008,0x0008,0x0afd,0x0afd,0x0afd,0x0afd
+.short	0x309a,0x309a,0x309a,0x309a,0xc140,0xc140,0xc140,0xc140
+.short	0x9808,0x9808,0x9808,0x9808,0x31fd,0x31fd,0x31fd,0x31fd
+.short	0x0082,0x0082,0x0082,0x0082,0x0082,0x0082,0x0082,0x0082
+.short	0x0642,0x0642,0x0642,0x0642,0x0642,0x0642,0x0642,0x0642
+.short	0x6682,0x6682,0x6682,0x6682,0x6682,0x6682,0x6682,0x6682
+.short	0xac42,0xac42,0xac42,0xac42,0xac42,0xac42,0xac42,0xac42
+.short	0x074f,0x074f,0x074f,0x074f,0x074f,0x074f,0x074f,0x074f
+.short	0x033d,0x033d,0x033d,0x033d,0x033d,0x033d,0x033d,0x033d
+.short	0x044f,0x044f,0x044f,0x044f,0x044f,0x044f,0x044f,0x044f
+.short	0xea3d,0xea3d,0xea3d,0xea3d,0xea3d,0xea3d,0xea3d,0xea3d
+.short	0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b
+.short	0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b,0x0c4b
+.short	0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b
+.short	0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b,0x3d4b
+.short	0x06d8,0x06d8,0x06d8,0x06d8,0x06d8,0x06d8,0x06d8,0x06d8
+.short	0x06d8,0x06d8,0x06d8,0x06d8,0x06d8,0x06d8,0x06d8,0x06d8
+.short	0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8
+.short	0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8,0x0ed8
+.short	0x0773,0x0773,0x0773,0x0773,0x0773,0x0773,0x0773,0x0773
+.short	0x0773,0x0773,0x0773,0x0773,0x0773,0x0773,0x0773,0x0773
+.short	0x3073,0x3073,0x3073,0x3073,0x3073,0x3073,0x3073,0x3073
+.short	0x3073,0x3073,0x3073,0x3073,0x3073,0x3073,0x3073,0x3073
+.short	0x068c,0x068c,0x01cc,0x01cc,0x06db,0x06db,0x0123,0x0123
+.short	0x00eb,0x00eb,0x0ab6,0x0ab6,0x0c50,0x0c50,0x0b5b,0x0b5b
+.short	0xea8c,0xea8c,0xa5cc,0xa5cc,0xe7db,0xe7db,0x3a23,0x3a23
+.short	0x11eb,0x11eb,0xccb6,0xccb6,0xfc50,0xfc50,0x6c5b,0x6c5b
+.short	0x0c98,0x0c98,0x099a,0x099a,0x06f3,0x06f3,0x04e3,0x04e3
+.short	0x09b6,0x09b6,0x0b53,0x0b53,0x0ad6,0x0ad6,0x044f,0x044f
+.short	0x5498,0x5498,0x379a,0x379a,0xaff3,0xaff3,0x7de3,0x7de3
+.short	0xcbb6,0xcbb6,0xd453,0xd453,0x2cd6,0x2cd6,0x014f,0x014f
+.short	0x0608,0x0608,0x0608,0x0608,0x011a,0x011a,0x011a,0x011a
+.short	0x072e,0x072e,0x072e,0x072e,0x050d,0x050d,0x050d,0x050d
+.short	0x9e08,0x9e08,0x9e08,0x9e08,0xaf1a,0xaf1a,0xaf1a,0xaf1a
+.short	0xb12e,0xb12e,0xb12e,0xb12e,0x5c0d,0x5c0d,0x5c0d,0x5c0d
+.short	0x090a,0x090a,0x090a,0x090a,0x0228,0x0228,0x0228,0x0228
+.short	0x0a75,0x0a75,0x0a75,0x0a75,0x083a,0x083a,0x083a,0x083a
+.short	0x870a,0x870a,0x870a,0x870a,0xfa28,0xfa28,0xfa28,0xfa28
+.short	0x1975,0x1975,0x1975,0x1975,0x163a,0x163a,0x163a,0x163a
+.short	0x0b82,0x0b82,0x0b82,0x0b82,0x0b82,0x0b82,0x0b82,0x0b82
+.short	0x0bf9,0x0bf9,0x0bf9,0x0bf9,0x0bf9,0x0bf9,0x0bf9,0x0bf9
+.short	0x7182,0x7182,0x7182,0x7182,0x7182,0x7182,0x7182,0x7182
+.short	0x66f9,0x66f9,0x66f9,0x66f9,0x66f9,0x66f9,0x66f9,0x66f9
+.short	0x052d,0x052d,0x052d,0x052d,0x052d,0x052d,0x052d,0x052d
+.short	0x0ac4,0x0ac4,0x0ac4,0x0ac4,0x0ac4,0x0ac4,0x0ac4,0x0ac4
+.short	0xbc2d,0xbc2d,0xbc2d,0xbc2d,0xbc2d,0xbc2d,0xbc2d,0xbc2d
+.short	0x16c4,0x16c4,0x16c4,0x16c4,0x16c4,0x16c4,0x16c4,0x16c4
+.short	0x0a93,0x0a93,0x0a93,0x0a93,0x0a93,0x0a93,0x0a93,0x0a93
+.short	0x0a93,0x0a93,0x0a93,0x0a93,0x0a93,0x0a93,0x0a93,0x0a93
+.short	0x9393,0x9393,0x9393,0x9393,0x9393,0x9393,0x9393,0x9393
+.short	0x9393,0x9393,0x9393,0x9393,0x9393,0x9393,0x9393,0x9393
+.short	0x00ab,0x00ab,0x00ab,0x00ab,0x00ab,0x00ab,0x00ab,0x00ab
+.short	0x00ab,0x00ab,0x00ab,0x00ab,0x00ab,0x00ab,0x00ab,0x00ab
+.short	0x51ab,0x51ab,0x51ab,0x51ab,0x51ab,0x51ab,0x51ab,0x51ab
+.short	0x51ab,0x51ab,0x51ab,0x51ab,0x51ab,0x51ab,0x51ab,0x51ab
+.short	0x072c,0x072c,0x072c,0x072c,0x072c,0x072c,0x072c,0x072c
+.short	0x072c,0x072c,0x072c,0x072c,0x072c,0x072c,0x072c,0x072c
+.short	0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c
+.short	0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c,0xcb2c
+.short	0x0167,0x0167,0x0167,0x0167,0x0167,0x0167,0x0167,0x0167
+.short	0x0167,0x0167,0x0167,0x0167,0x0167,0x0167,0x0167,0x0167
+.short	0xc667,0xc667,0xc667,0xc667,0xc667,0xc667,0xc667,0xc667
+.short	0xc667,0xc667,0xc667,0xc667,0xc667,0xc667,0xc667,0xc667
+.short	0x02f6,0x02f6,0x02f6,0x02f6,0x02f6,0x02f6,0x02f6,0x02f6
+.short	0x02f6,0x02f6,0x02f6,0x02f6,0x02f6,0x02f6,0x02f6,0x02f6
+.short	0x84f6,0x84f6,0x84f6,0x84f6,0x84f6,0x84f6,0x84f6,0x84f6
+.short	0x84f6,0x84f6,0x84f6,0x84f6,0x84f6,0x84f6,0x84f6,0x84f6
+.short	0x05a1,0x05a1,0x05a1,0x05a1,0x05a1,0x05a1,0x05a1,0x05a1
+.short	0x05a1,0x05a1,0x05a1,0x05a1,0x05a1,0x05a1,0x05a1,0x05a1
+.short	0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1
+.short	0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1,0xd8a1
 #ifndef __APPLE__
 .text
 .globl	mlkem_keygen_avx2
@@ -11907,9 +10833,9 @@ _mlkem_csubq_avx2:
 .section	__DATA,__data
 #endif /* __APPLE__ */
 #ifndef __APPLE__
-.align	16
+.align	32
 #else
-.p2align	4
+.p2align	5
 #endif /* __APPLE__ */
 L_mlkem_rej_idx:
 .quad	0xffffffffffffffff,0xffffffffffffff00
@@ -12051,8 +10977,8 @@ L_mlkem_rej_idx:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_rej_q:
-.quad	0xd010d010d010d01, 0xd010d010d010d01
-.quad	0xd010d010d010d01, 0xd010d010d010d01
+.quad	0x0d010d010d010d01,0x0d010d010d010d01
+.quad	0x0d010d010d010d01,0x0d010d010d010d01
 #ifndef __APPLE__
 .data
 #else
@@ -12064,8 +10990,8 @@ L_mlkem_rej_q:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_rej_ones:
-.quad	0x101010101010101, 0x101010101010101
-.quad	0x101010101010101, 0x101010101010101
+.quad	0x0101010101010101,0x0101010101010101
+.quad	0x0101010101010101,0x0101010101010101
 #ifndef __APPLE__
 .data
 #else
@@ -12077,8 +11003,8 @@ L_mlkem_rej_ones:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_rej_mask:
-.quad	0xfff0fff0fff0fff, 0xfff0fff0fff0fff
-.quad	0xfff0fff0fff0fff, 0xfff0fff0fff0fff
+.quad	0x0fff0fff0fff0fff,0x0fff0fff0fff0fff
+.quad	0x0fff0fff0fff0fff,0x0fff0fff0fff0fff
 #ifndef __APPLE__
 .data
 #else
@@ -12090,8 +11016,8 @@ L_mlkem_rej_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_rej_shuffle:
-.quad	0x504040302010100, 0xb0a0a0908070706
-.quad	0x908080706050504, 0xf0e0e0d0c0b0b0a
+.quad	0x0504040302010100,0x0b0a0a0908070706
+.quad	0x0908080706050504,0x0f0e0e0d0c0b0b0a
 #ifndef __APPLE__
 .text
 .globl	mlkem_rej_uniform_n_avx2
@@ -13040,8 +11966,8 @@ L_mlkem_rej_uniform_avx2_done_64:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_249:
-.quad	0x24924900249249, 0x24924900249249
-.quad	0x24924900249249, 0x24924900249249
+.quad	0x0024924900249249,0x0024924900249249
+.quad	0x0024924900249249,0x0024924900249249
 #ifndef __APPLE__
 .data
 #else
@@ -13053,8 +11979,8 @@ L_mlkem_mask_249:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_6db:
-.quad	0x6db6db006db6db, 0x6db6db006db6db
-.quad	0x6db6db006db6db, 0x6db6db006db6db
+.quad	0x006db6db006db6db,0x006db6db006db6db
+.quad	0x006db6db006db6db,0x006db6db006db6db
 #ifndef __APPLE__
 .data
 #else
@@ -13066,8 +11992,8 @@ L_mlkem_mask_6db:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_07:
-.quad	0x700000007, 0x700000007
-.quad	0x700000007, 0x700000007
+.quad	0x0000000700000007,0x0000000700000007
+.quad	0x0000000700000007,0x0000000700000007
 #ifndef __APPLE__
 .data
 #else
@@ -13079,8 +12005,8 @@ L_mlkem_mask_07:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_70:
-.quad	0x7000000070000, 0x7000000070000
-.quad	0x7000000070000, 0x7000000070000
+.quad	0x0007000000070000,0x0007000000070000
+.quad	0x0007000000070000,0x0007000000070000
 #ifndef __APPLE__
 .data
 #else
@@ -13092,8 +12018,8 @@ L_mlkem_mask_70:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_3:
-.quad	0x3000300030003, 0x3000300030003
-.quad	0x3000300030003, 0x3000300030003
+.quad	0x0003000300030003,0x0003000300030003
+.quad	0x0003000300030003,0x0003000300030003
 #ifndef __APPLE__
 .data
 #else
@@ -13105,8 +12031,8 @@ L_mlkem_mask_3:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_shuff:
-.quad	0xff050403ff020100, 0xff0b0a09ff080706
-.quad	0xff090807ff060504, 0xff0f0e0dff0c0b0a
+.quad	0xff050403ff020100,0xff0b0a09ff080706
+.quad	0xff090807ff060504,0xff0f0e0dff0c0b0a
 #ifndef __APPLE__
 .text
 .globl	mlkem_cbd_eta3_avx2
@@ -13381,8 +12307,8 @@ _mlkem_cbd_eta3_avx2:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_55:
-.quad	0x5555555555555555, 0x5555555555555555
-.quad	0x5555555555555555, 0x5555555555555555
+.quad	0x5555555555555555,0x5555555555555555
+.quad	0x5555555555555555,0x5555555555555555
 #ifndef __APPLE__
 .data
 #else
@@ -13394,8 +12320,8 @@ L_mlkem_mask_55:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_33:
-.quad	0x3333333333333333, 0x3333333333333333
-.quad	0x3333333333333333, 0x3333333333333333
+.quad	0x3333333333333333,0x3333333333333333
+.quad	0x3333333333333333,0x3333333333333333
 #ifndef __APPLE__
 .data
 #else
@@ -13407,8 +12333,8 @@ L_mlkem_mask_33:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_03:
-.quad	0x303030303030303, 0x303030303030303
-.quad	0x303030303030303, 0x303030303030303
+.quad	0x0303030303030303,0x0303030303030303
+.quad	0x0303030303030303,0x0303030303030303
 #ifndef __APPLE__
 .data
 #else
@@ -13420,8 +12346,8 @@ L_mlkem_mask_03:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_mask_0f:
-.quad	0xf0f0f0f0f0f0f0f, 0xf0f0f0f0f0f0f0f
-.quad	0xf0f0f0f0f0f0f0f, 0xf0f0f0f0f0f0f0f
+.quad	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
+.quad	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
 #ifndef __APPLE__
 .text
 .globl	mlkem_cbd_eta2_avx2
@@ -13562,14 +12488,8 @@ _mlkem_cbd_eta2_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_mask:
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
-.value	0x03ff,0x03ff
+.short	0x03ff,0x03ff,0x03ff,0x03ff,0x03ff,0x03ff,0x03ff,0x03ff
+.short	0x03ff,0x03ff,0x03ff,0x03ff,0x03ff,0x03ff,0x03ff,0x03ff
 #ifndef __APPLE__
 .data
 #else
@@ -13581,8 +12501,8 @@ L_mlkem_compress_10_avx2_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_shift:
-.quad	0x400000104000001, 0x400000104000001
-.quad	0x400000104000001, 0x400000104000001
+.quad	0x0400000104000001,0x0400000104000001
+.quad	0x0400000104000001,0x0400000104000001
 #ifndef __APPLE__
 .data
 #else
@@ -13594,8 +12514,8 @@ L_mlkem_compress_10_avx2_shift:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_shlv:
-.quad	0xc, 0xc
-.quad	0xc, 0xc
+.quad	0x000000000000000c,0x000000000000000c
+.quad	0x000000000000000c,0x000000000000000c
 #ifndef __APPLE__
 .data
 #else
@@ -13607,14 +12527,10 @@ L_mlkem_compress_10_avx2_shlv:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_shuf:
-.value	0x100,0x302
-.value	0x804,0xa09
-.value	0xc0b,0xffff
-.value	0xffff,0xffff
-.value	0xa09,0xc0b
-.value	0xffff,0xffff
-.value	0xffff,0x100
-.value	0x302,0x804
+.byte	0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a
+.byte	0x0b,0x0c,0xff,0xff,0xff,0xff,0xff,0xff
+.byte	0x09,0x0a,0x0b,0x0c,0xff,0xff,0xff,0xff
+.byte	0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x08
 #ifndef __APPLE__
 .data
 #else
@@ -13626,14 +12542,8 @@ L_mlkem_compress_10_avx2_shuf:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_v:
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
 #ifndef __APPLE__
 .data
 #else
@@ -13645,14 +12555,8 @@ L_mlkem_compress_10_avx2_v:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_offset:
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
 #ifndef __APPLE__
 .data
 #else
@@ -13664,14 +12568,8 @@ L_mlkem_compress_10_avx2_offset:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_10_avx2_shift12:
-.value	0x1000,0x1000
-.value	0x1000,0x1000
-.value	0x1000,0x1000
-.value	0x1000,0x1000
-.value	0x1000,0x1000
-.value	0x1000,0x1000
-.value	0x1000,0x1000
-.value	0x1000,0x1000
+.short	0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000
+.short	0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000
 #ifndef __APPLE__
 .text
 .globl	mlkem_compress_10_avx2
@@ -14012,6 +12910,11 @@ L_mlkem_compress_10_avx2_start:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_decompress_10_avx2_mask:
 .long	0x7fe01ff8,0x7fe01ff8,0x7fe01ff8,0x7fe01ff8
 .long	0x7fe01ff8,0x7fe01ff8,0x7fe01ff8,0x7fe01ff8
@@ -14026,13 +12929,18 @@ L_mlkem_decompress_10_avx2_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_decompress_10_avx2_sllv:
-.quad	0x4, 0x4
-.quad	0x4, 0x4
+.quad	0x0000000000000004,0x0000000000000004
+.quad	0x0000000000000004,0x0000000000000004
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_decompress_10_avx2_q:
 .long	0x0d013404,0x0d013404,0x0d013404,0x0d013404
 .long	0x0d013404,0x0d013404,0x0d013404,0x0d013404
@@ -14047,14 +12955,10 @@ L_mlkem_decompress_10_avx2_q:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_10_avx2_shuf:
-.value	0x100,0x201
-.value	0x302,0x403
-.value	0x605,0x706
-.value	0x807,0x908
-.value	0x302,0x403
-.value	0x504,0x605
-.value	0x807,0x908
-.value	0xa09,0xb0a
+.byte	0x00,0x01,0x01,0x02,0x02,0x03,0x03,0x04
+.byte	0x05,0x06,0x06,0x07,0x07,0x08,0x08,0x09
+.byte	0x02,0x03,0x03,0x04,0x04,0x05,0x05,0x06
+.byte	0x07,0x08,0x08,0x09,0x09,0x0a,0x0a,0x0b
 #ifndef __APPLE__
 .text
 .globl	mlkem_decompress_10_avx2
@@ -14204,14 +13108,8 @@ L_mlkem_decompress_10_avx2_start:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_v:
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
 #ifndef __APPLE__
 .data
 #else
@@ -14223,14 +13121,8 @@ L_mlkem_compress_11_avx2_v:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_off:
-.value	0x0024,0x0024
-.value	0x0024,0x0024
-.value	0x0024,0x0024
-.value	0x0024,0x0024
-.value	0x0024,0x0024
-.value	0x0024,0x0024
-.value	0x0024,0x0024
-.value	0x0024,0x0024
+.short	0x0024,0x0024,0x0024,0x0024,0x0024,0x0024,0x0024,0x0024
+.short	0x0024,0x0024,0x0024,0x0024,0x0024,0x0024,0x0024,0x0024
 #ifndef __APPLE__
 .data
 #else
@@ -14242,14 +13134,8 @@ L_mlkem_compress_11_avx2_off:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_shift13:
-.value	0x2000,0x2000
-.value	0x2000,0x2000
-.value	0x2000,0x2000
-.value	0x2000,0x2000
-.value	0x2000,0x2000
-.value	0x2000,0x2000
-.value	0x2000,0x2000
-.value	0x2000,0x2000
+.short	0x2000,0x2000,0x2000,0x2000,0x2000,0x2000,0x2000,0x2000
+.short	0x2000,0x2000,0x2000,0x2000,0x2000,0x2000,0x2000,0x2000
 #ifndef __APPLE__
 .data
 #else
@@ -14261,14 +13147,8 @@ L_mlkem_compress_11_avx2_shift13:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_mask:
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
-.value	0x07ff,0x07ff
+.short	0x07ff,0x07ff,0x07ff,0x07ff,0x07ff,0x07ff,0x07ff,0x07ff
+.short	0x07ff,0x07ff,0x07ff,0x07ff,0x07ff,0x07ff,0x07ff,0x07ff
 #ifndef __APPLE__
 .data
 #else
@@ -14280,13 +13160,18 @@ L_mlkem_compress_11_avx2_mask:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_shift:
-.quad	0x800000108000001, 0x800000108000001
-.quad	0x800000108000001, 0x800000108000001
+.quad	0x0800000108000001,0x0800000108000001
+.quad	0x0800000108000001,0x0800000108000001
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_sllvd:
 .long	0x0000000a,0x00000000,0x0000000a,0x00000000
 .long	0x0000000a,0x00000000,0x0000000a,0x00000000
@@ -14301,8 +13186,8 @@ L_mlkem_compress_11_avx2_sllvd:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_srlvq:
-.quad	0xa, 0x1e
-.quad	0xa, 0x1e
+.quad	0x000000000000000a,0x000000000000001e
+.quad	0x000000000000000a,0x000000000000001e
 #ifndef __APPLE__
 .data
 #else
@@ -14314,14 +13199,10 @@ L_mlkem_compress_11_avx2_srlvq:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_11_avx2_shuf:
-.value	0x100,0x302
-.value	0x504,0x706
-.value	0x908,0xff0a
-.value	0xffff,0xffff
-.value	0x605,0x807
-.value	0xa09,0xffff
-.value	0xffff,0x0
-.value	0x201,0x403
+.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
+.byte	0x08,0x09,0x0a,0xff,0xff,0xff,0xff,0xff
+.byte	0x05,0x06,0x07,0x08,0x09,0x0a,0xff,0xff
+.byte	0xff,0xff,0x00,0x00,0x01,0x02,0x03,0x04
 #ifndef __APPLE__
 .text
 .globl	mlkem_compress_11_avx2
@@ -14717,14 +13598,8 @@ L_mlkem_compress_11_avx2_start:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_11_avx2_q:
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
 #ifndef __APPLE__
 .data
 #else
@@ -14736,19 +13611,20 @@ L_mlkem_decompress_11_avx2_q:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_11_avx2_shuf:
-.value	0x100,0x201
-.value	0x302,0x504
-.value	0x605,0x706
-.value	0x908,0xa09
-.value	0x403,0x504
-.value	0x605,0x807
-.value	0x908,0xa09
-.value	0xc0b,0xd0c
+.byte	0x00,0x01,0x01,0x02,0x02,0x03,0x04,0x05
+.byte	0x05,0x06,0x06,0x07,0x08,0x09,0x09,0x0a
+.byte	0x03,0x04,0x04,0x05,0x05,0x06,0x07,0x08
+.byte	0x08,0x09,0x09,0x0a,0x0b,0x0c,0x0c,0x0d
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_decompress_11_avx2_sllv:
 .long	0x00000000,0x00000001,0x00000000,0x00000000
 .long	0x00000000,0x00000001,0x00000000,0x00000000
@@ -14763,8 +13639,8 @@ L_mlkem_decompress_11_avx2_sllv:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_decompress_11_avx2_srlv:
-.quad	0x0, 0x2
-.quad	0x0, 0x2
+.quad	0x0000000000000000,0x0000000000000002
+.quad	0x0000000000000000,0x0000000000000002
 #ifndef __APPLE__
 .data
 #else
@@ -14776,14 +13652,8 @@ L_mlkem_decompress_11_avx2_srlv:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_11_avx2_shift:
-.value	0x0020,0x0004
-.value	0x0001,0x0020
-.value	0x0008,0x0001
-.value	0x0020,0x0004
-.value	0x0020,0x0004
-.value	0x0001,0x0020
-.value	0x0008,0x0001
-.value	0x0020,0x0004
+.short	0x0020,0x0004,0x0001,0x0020,0x0008,0x0001,0x0020,0x0004
+.short	0x0020,0x0004,0x0001,0x0020,0x0008,0x0001,0x0020,0x0004
 #ifndef __APPLE__
 .data
 #else
@@ -14795,14 +13665,8 @@ L_mlkem_decompress_11_avx2_shift:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_11_avx2_mask:
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
-.value	0x7ff0,0x7ff0
+.short	0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0
+.short	0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0,0x7ff0
 #ifndef __APPLE__
 .text
 .globl	mlkem_decompress_11_avx2
@@ -14986,14 +13850,8 @@ L_mlkem_decompress_11_avx2_start:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_4_avx2_mask:
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
-.value	0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
+.short	0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f,0x000f
 #ifndef __APPLE__
 .data
 #else
@@ -15005,19 +13863,18 @@ L_mlkem_compress_4_avx2_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_4_avx2_shift:
-.value	0x0200,0x0200
-.value	0x0200,0x0200
-.value	0x0200,0x0200
-.value	0x0200,0x0200
-.value	0x0200,0x0200
-.value	0x0200,0x0200
-.value	0x0200,0x0200
-.value	0x0200,0x0200
+.short	0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200
+.short	0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_compress_4_avx2_perm:
 .long	0x00000000,0x00000004,0x00000001,0x00000005
 .long	0x00000002,0x00000006,0x00000003,0x00000007
@@ -15032,14 +13889,8 @@ L_mlkem_compress_4_avx2_perm:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_4_avx2_v:
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
 #ifndef __APPLE__
 .data
 #else
@@ -15051,14 +13902,8 @@ L_mlkem_compress_4_avx2_v:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_4_avx2_shift12:
-.value	0x1001,0x1001
-.value	0x1001,0x1001
-.value	0x1001,0x1001
-.value	0x1001,0x1001
-.value	0x1001,0x1001
-.value	0x1001,0x1001
-.value	0x1001,0x1001
-.value	0x1001,0x1001
+.short	0x1001,0x1001,0x1001,0x1001,0x1001,0x1001,0x1001,0x1001
+.short	0x1001,0x1001,0x1001,0x1001,0x1001,0x1001,0x1001,0x1001
 #ifndef __APPLE__
 .text
 .globl	mlkem_compress_4_avx2
@@ -15162,6 +14007,11 @@ _mlkem_compress_4_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_decompress_4_avx2_mask:
 .long	0x00f0000f,0x00f0000f,0x00f0000f,0x00f0000f
 .long	0x00f0000f,0x00f0000f,0x00f0000f,0x00f0000f
@@ -15170,6 +14020,11 @@ L_mlkem_decompress_4_avx2_mask:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_decompress_4_avx2_shift:
 .long	0x00800800,0x00800800,0x00800800,0x00800800
 .long	0x00800800,0x00800800,0x00800800,0x00800800
@@ -15184,14 +14039,8 @@ L_mlkem_decompress_4_avx2_shift:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_4_avx2_q:
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
 #ifndef __APPLE__
 .data
 #else
@@ -15203,14 +14052,10 @@ L_mlkem_decompress_4_avx2_q:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_4_avx2_shuf:
-.value	0x0,0x0
-.value	0x101,0x101
-.value	0x202,0x202
-.value	0x303,0x303
-.value	0x404,0x404
-.value	0x505,0x505
-.value	0x606,0x606
-.value	0x707,0x707
+.byte	0x00,0x00,0x00,0x00,0x01,0x01,0x01,0x01
+.byte	0x02,0x02,0x02,0x02,0x03,0x03,0x03,0x03
+.byte	0x04,0x04,0x04,0x04,0x05,0x05,0x05,0x05
+.byte	0x06,0x06,0x06,0x06,0x07,0x07,0x07,0x07
 #ifndef __APPLE__
 .text
 .globl	mlkem_decompress_4_avx2
@@ -15339,14 +14184,8 @@ _mlkem_decompress_4_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_v:
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
-.value	0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
+.short	0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf,0x4ebf
 #ifndef __APPLE__
 .data
 #else
@@ -15358,14 +14197,8 @@ L_mlkem_compress_5_avx2_v:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_shift:
-.value	0x0400,0x0400
-.value	0x0400,0x0400
-.value	0x0400,0x0400
-.value	0x0400,0x0400
-.value	0x0400,0x0400
-.value	0x0400,0x0400
-.value	0x0400,0x0400
-.value	0x0400,0x0400
+.short	0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400
+.short	0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400
 #ifndef __APPLE__
 .data
 #else
@@ -15377,14 +14210,8 @@ L_mlkem_compress_5_avx2_shift:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_mask:
-.value	0x001f,0x001f
-.value	0x001f,0x001f
-.value	0x001f,0x001f
-.value	0x001f,0x001f
-.value	0x001f,0x001f
-.value	0x001f,0x001f
-.value	0x001f,0x001f
-.value	0x001f,0x001f
+.short	0x001f,0x001f,0x001f,0x001f,0x001f,0x001f,0x001f,0x001f
+.short	0x001f,0x001f,0x001f,0x001f,0x001f,0x001f,0x001f,0x001f
 #ifndef __APPLE__
 .data
 #else
@@ -15396,19 +14223,18 @@ L_mlkem_compress_5_avx2_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_shift1:
-.value	0x2001,0x2001
-.value	0x2001,0x2001
-.value	0x2001,0x2001
-.value	0x2001,0x2001
-.value	0x2001,0x2001
-.value	0x2001,0x2001
-.value	0x2001,0x2001
-.value	0x2001,0x2001
+.short	0x2001,0x2001,0x2001,0x2001,0x2001,0x2001,0x2001,0x2001
+.short	0x2001,0x2001,0x2001,0x2001,0x2001,0x2001,0x2001,0x2001
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_shift2:
 .long	0x04000001,0x04000001,0x04000001,0x04000001
 .long	0x04000001,0x04000001,0x04000001,0x04000001
@@ -15423,8 +14249,8 @@ L_mlkem_compress_5_avx2_shift2:
 .p2align	5
 #endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_shlv:
-.quad	0xc, 0xc
-.quad	0xc, 0xc
+.quad	0x000000000000000c,0x000000000000000c
+.quad	0x000000000000000c,0x000000000000000c
 #ifndef __APPLE__
 .data
 #else
@@ -15436,14 +14262,10 @@ L_mlkem_compress_5_avx2_shlv:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_compress_5_avx2_shuffle:
-.value	0x100,0x302
-.value	0xff04,0xffff
-.value	0xffff,0x908
-.value	0xb0a,0xff0c
-.value	0xa09,0xc0b
-.value	0xff,0x201
-.value	0x403,0xffff
-.value	0xffff,0x8ff
+.byte	0x00,0x01,0x02,0x03,0x04,0xff,0xff,0xff
+.byte	0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0xff
+.byte	0x09,0x0a,0x0b,0x0c,0xff,0x00,0x01,0x02
+.byte	0x03,0x04,0xff,0xff,0xff,0xff,0xff,0x08
 #ifndef __APPLE__
 .text
 .globl	mlkem_compress_5_avx2
@@ -15608,14 +14430,8 @@ _mlkem_compress_5_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_5_avx2_q:
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
-.value	0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
+.short	0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
 #ifndef __APPLE__
 .data
 #else
@@ -15627,14 +14443,10 @@ L_mlkem_decompress_5_avx2_q:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_5_avx2_shuf:
-.value	0x0,0x100
-.value	0x101,0x201
-.value	0x302,0x303
-.value	0x403,0x404
-.value	0x505,0x605
-.value	0x606,0x706
-.value	0x807,0x808
-.value	0x908,0x909
+.byte	0x00,0x00,0x00,0x01,0x01,0x01,0x01,0x02
+.byte	0x02,0x03,0x03,0x03,0x03,0x04,0x04,0x04
+.byte	0x05,0x05,0x05,0x06,0x06,0x06,0x06,0x07
+.byte	0x07,0x08,0x08,0x08,0x08,0x09,0x09,0x09
 #ifndef __APPLE__
 .data
 #else
@@ -15646,14 +14458,8 @@ L_mlkem_decompress_5_avx2_shuf:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_5_avx2_mask:
-.value	0x001f,0x03e0
-.value	0x007c,0x0f80
-.value	0x01f0,0x003e
-.value	0x07c0,0x00fb
-.value	0x001f,0x03e0
-.value	0x007c,0x0f80
-.value	0x01f0,0x003e
-.value	0x07c0,0x00fb
+.short	0x001f,0x03e0,0x007c,0x0f80,0x01f0,0x003e,0x07c0,0x00fb
+.short	0x001f,0x03e0,0x007c,0x0f80,0x01f0,0x003e,0x07c0,0x00fb
 #ifndef __APPLE__
 .data
 #else
@@ -15665,14 +14471,8 @@ L_mlkem_decompress_5_avx2_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_decompress_5_avx2_shift:
-.value	0x0400,0x0020
-.value	0x0100,0x0008
-.value	0x0040,0x0200
-.value	0x0010,0x0080
-.value	0x0400,0x0020
-.value	0x0100,0x0008
-.value	0x0040,0x0200
-.value	0x0010,0x0080
+.short	0x0400,0x0020,0x0100,0x0008,0x0040,0x0200,0x0010,0x0080
+.short	0x0400,0x0020,0x0100,0x0008,0x0040,0x0200,0x0010,0x0080
 #ifndef __APPLE__
 .text
 .globl	mlkem_decompress_5_avx2
@@ -15798,6 +14598,11 @@ _mlkem_decompress_5_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_from_msg_avx2_shift:
 .long	0x00000003,0x00000002,0x00000001,0x00000000
 .long	0x00000003,0x00000002,0x00000001,0x00000000
@@ -15812,14 +14617,10 @@ L_mlkem_from_msg_avx2_shift:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_from_msg_avx2_shuf:
-.value	0x100,0x504
-.value	0x908,0xd0c
-.value	0x302,0x706
-.value	0xb0a,0xf0e
-.value	0x100,0x504
-.value	0x908,0xd0c
-.value	0x302,0x706
-.value	0xb0a,0xf0e
+.byte	0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d
+.byte	0x02,0x03,0x06,0x07,0x0a,0x0b,0x0e,0x0f
+.byte	0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d
+.byte	0x02,0x03,0x06,0x07,0x0a,0x0b,0x0e,0x0f
 #ifndef __APPLE__
 .data
 #else
@@ -15831,14 +14632,8 @@ L_mlkem_from_msg_avx2_shuf:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_from_msg_avx2_hqs:
-.value	0x0681,0x0681
-.value	0x0681,0x0681
-.value	0x0681,0x0681
-.value	0x0681,0x0681
-.value	0x0681,0x0681
-.value	0x0681,0x0681
-.value	0x0681,0x0681
-.value	0x0681,0x0681
+.short	0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681
+.short	0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681
 #ifndef __APPLE__
 .text
 .globl	mlkem_from_msg_avx2
@@ -15975,14 +14770,8 @@ _mlkem_from_msg_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_to_msg_avx2_hqs:
-.value	0x0680,0x0680
-.value	0x0680,0x0680
-.value	0x0680,0x0680
-.value	0x0680,0x0680
-.value	0x0680,0x0680
-.value	0x0680,0x0680
-.value	0x0680,0x0680
-.value	0x0680,0x0680
+.short	0x0680,0x0680,0x0680,0x0680,0x0680,0x0680,0x0680,0x0680
+.short	0x0680,0x0680,0x0680,0x0680,0x0680,0x0680,0x0680,0x0680
 #ifndef __APPLE__
 .data
 #else
@@ -15994,14 +14783,8 @@ L_mlkem_to_msg_avx2_hqs:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_to_msg_avx2_hhqs:
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
-.value	0xfcc1,0xfcc1
+.short	0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1
+.short	0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1,0xfcc1
 #ifndef __APPLE__
 .text
 .globl	mlkem_to_msg_avx2
@@ -16128,19 +14911,20 @@ _mlkem_to_msg_avx2:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_from_bytes_avx2_shuf:
-.value	0x100,0xff02
-.value	0x403,0xff05
-.value	0x706,0xff08
-.value	0xa09,0xff0b
-.value	0x504,0xff06
-.value	0x807,0xff09
-.value	0xb0a,0xff0c
-.value	0xe0d,0xff0f
+.byte	0x00,0x01,0x02,0xff,0x03,0x04,0x05,0xff
+.byte	0x06,0x07,0x08,0xff,0x09,0x0a,0x0b,0xff
+.byte	0x04,0x05,0x06,0xff,0x07,0x08,0x09,0xff
+.byte	0x0a,0x0b,0x0c,0xff,0x0d,0x0e,0x0f,0xff
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_from_bytes_avx2_mask:
 .long	0x00000fff,0x00000fff,0x00000fff,0x00000fff
 .long	0x00000fff,0x00000fff,0x00000fff,0x00000fff
@@ -16309,6 +15093,11 @@ _mlkem_from_bytes_avx2:
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_to_bytes_avx2_mask:
 .long	0x00000fff,0x00000fff,0x00000fff,0x00000fff
 .long	0x00000fff,0x00000fff,0x00000fff,0x00000fff
@@ -16323,19 +15112,20 @@ L_mlkem_to_bytes_avx2_mask:
 .p2align	4
 #endif /* __APPLE__ */
 L_mlkem_to_bytes_avx2_shuf:
-.value	0x100,0x402
-.value	0x605,0x908
-.value	0xc0a,0xe0d
-.value	0xffff,0xffff
-.value	0x605,0x908
-.value	0xc0a,0xe0d
-.value	0xffff,0xffff
-.value	0x100,0x402
+.byte	0x00,0x01,0x02,0x04,0x05,0x06,0x08,0x09
+.byte	0x0a,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff
+.byte	0x05,0x06,0x08,0x09,0x0a,0x0c,0x0d,0x0e
+.byte	0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x04
 #ifndef __APPLE__
 .data
 #else
 .section	__DATA,__data
 #endif /* __APPLE__ */
+#ifndef __APPLE__
+.align	16
+#else
+.p2align	4
+#endif /* __APPLE__ */
 L_mlkem_to_bytes_avx2_perm:
 .long	0x00000000,0x00000001,0x00000002,0x00000007
 .long	0x00000004,0x00000005,0x00000003,0x00000006