asm/ct_inverse_mod_256-*.pl: fix another corner case.

dot-asm · dot-asm · commit fd453524b12c · 2022-01-25T17:57:24.000+01:00
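The excess word can be 1, 0 or -1 (not just 1 or 0), so the final
adjustment now conditionally negates the masked modulus and adds it,
instead of only conditionally subtracting it.
Thanks to Guido Vranken for the report.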
diff --git a/src/asm/ct_inverse_mod_256-armv8.pl b/src/asm/ct_inverse_mod_256-armv8.pl
@@ -230,19 +230,31 @@
 	and	@t[7],   @acc[7], @t[0]
 	adcs	@acc[2], @acc[2], @t[6]
 	adcs	@acc[3], @t[3],   @t[7]
-	adc	@t[1], @t[1], xzr		// @t[1] is 1 or 0
+	adc	@t[1], @t[1], xzr		// @t[1] is 1, 0 or -1
 
 	neg	@t[0], @t[1]
-
-	and	@acc[4], @acc[4], @t[0]		// subtract mod<<256 conditionally
-	and	@acc[5], @acc[5], @t[0]
-	subs	@acc[0], @acc[0], @acc[4]
-	and	@acc[6], @acc[6], @t[0]
-	sbcs	@acc[1], @acc[1], @acc[5]
-	and	@acc[7], @acc[7], @t[0]
-	sbcs	@acc[2], @acc[2], @acc[6]
+	orr	@t[1], @t[1], @t[0]		// excess bit or sign as mask
+	asr	@t[0], @t[0], #63		// excess bit as mask
+
+	and	@acc[4], @acc[4], @t[1]		// mask |mod|
+	and	@acc[5], @acc[5], @t[1]
+	and	@acc[6], @acc[6], @t[1]
+	and	@acc[7], @acc[7], @t[1]
+
+	eor	@acc[4], @acc[4], @t[0]		// conditionally negate |mod|
+	eor	@acc[5], @acc[5], @t[0]
+	adds	@acc[4], @acc[4], @t[0], lsr#63
+	eor	@acc[6], @acc[6], @t[0]
+	adcs	@acc[5], @acc[5], xzr
+	eor	@acc[7], @acc[7], @t[0]
+	adcs	@acc[6], @acc[6], xzr
+	adc	@acc[7], @acc[7], xzr
+
+	adds	@acc[0], @acc[0], @acc[4]	// final adjustment for |mod|<<256
+	adcs	@acc[1], @acc[1], @acc[5]
+	adcs	@acc[2], @acc[2], @acc[6]
 	stp	@acc[0], @acc[1], [$out_ptr,#8*4]
-	sbcs	@acc[3], @acc[3], @acc[7]
+	adc	@acc[3], @acc[3], @acc[7]
 	stp	@acc[2], @acc[3], [$out_ptr,#8*6]
 
 	add	sp, sp, #$frame
diff --git a/src/asm/ct_inverse_mod_256-x86_64.pl b/src/asm/ct_inverse_mod_256-x86_64.pl
@@ -297,20 +297,34 @@
 	adc	%rdx,    @acc[7]
 	adc	\$0,     %rax
 
-	neg	%rax			# excess bit as mask
+	mov	%rax, %rdx
+	neg	%rax
+	or	%rax, %rdx		# excess bit or sign as mask
+	sar	\$63, %rax		# excess bit as mask
 
-	mov	%rax, @acc[0]		# mask |modulus|
-	mov	%rax, @acc[1]
+	mov	%rdx, @acc[0]		# mask |modulus|
+	mov	%rdx, @acc[1]
 	and	8*0($in_ptr), @acc[0]
-	mov	%rax, @acc[2]
+	mov	%rdx, @acc[2]
 	and	8*1($in_ptr), @acc[1]
 	and	8*2($in_ptr), @acc[2]
-	and	8*3($in_ptr), %rax
+	and	8*3($in_ptr), %rdx
 
-	sub	@acc[0], @acc[4]	# conditionally subtract |modulus|<<256
-	sbb	@acc[1], @acc[5]
-	sbb	@acc[2], @acc[6]
-	sbb	%rax,    @acc[7]
+	xor	%rax, @acc[0]		# conditionally negate |modulus|
+	xor	%rcx, %rcx
+	xor	%rax, @acc[1]
+	sub	%rax, %rcx
+	xor	%rax, @acc[2]
+	xor	%rax, %rdx
+	add	%rcx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, %rdx
+
+	add	@acc[0], @acc[4]	# final adjustment for |modulus|<<256
+	adc	@acc[1], @acc[5]
+	adc	@acc[2], @acc[6]
+	adc	%rdx,    @acc[7]
 
 	mov	@acc[4], 8*4($out_ptr)	# store absolute value
 	mov	@acc[5], 8*5($out_ptr)