Skip to content

Commit

Permalink
Optimize chacha20 for aarch64.
Browse files Browse the repository at this point in the history
The previous 6 * NEON + 2 * ALU code path is optimized for ThunderX2 and
is suboptimal on most other platforms. Detecting the microarchitecture
at runtime and choosing a suitable code path helps achieve the best
performance.

This PR changes the code path to 4 * NEON + 1 * ALU for A53, A55, A57
and A72 (which is also commonly used in Arm servers) cores.
chacha20_neon then processes 320 bytes of data at a time and has
better overall performance.

Use the MIDR_EL1 system register to determine the CPU core at runtime.
Based on PR openssl#11744.

Performance changes after applying the optimization:
                          A55     A53    A57    A72
chacha20@8192             +10.2%  +9.1%  +5.8%  +4.3%
chacha20@16384            +10.4%  +9.2%  +5.7%  +4.3%
chacha20-poly1305@8192    +7.4%   +6.7%  +4.6%  +3.3%
chacha20-poly1305@16384   +7.5%   +6.9%  +4.8%  +3.6%

Other cores don't change code path, and their performance remains the
same (tested on Qualcomm SDA845).

Change-Id: I844b9fcadd94595db0007bb0dbdff8548c47775e
CustomizedGitHooks: yes
  • Loading branch information
xffbai committed Dec 10, 2020
1 parent 5ea64b4 commit c89c00d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 0 deletions.
4 changes: 4 additions & 0 deletions crypto/arm_arch.h
Expand Up @@ -72,6 +72,7 @@
# ifndef __ASSEMBLER__
/* Capability bitmask; tested against flags such as ARMV8_CPUID. */
extern unsigned int OPENSSL_armcap_P;
/*
 * CPU identification value; on AArch64 it is filled from
 * _armv8_cpuid_probe() when ARMV8_CPUID is set, otherwise it stays 0.
 */
extern unsigned int OPENSSL_arm_midr;
/*
 * ChaCha20 code-path selector read by the AArch64 NEON routine: non-zero
 * takes the 320-byte (4xNEON+1xIALU) loop regardless of input length.
 * OPENSSL_cpuid_setup() sets it to 1 for Cortex-A53/A55/A57/A72.
 */
extern unsigned int OPENSSL_arm_chacha_choose;
# endif

# define ARMV7_NEON (1<<0)
Expand All @@ -95,6 +96,9 @@ extern unsigned int OPENSSL_arm_midr;

# define ARM_CPU_IMP_ARM 0x41

# define ARM_CPU_PART_CORTEX_A53 0xD03
# define ARM_CPU_PART_CORTEX_A55 0xD05
# define ARM_CPU_PART_CORTEX_A57 0xD07
# define ARM_CPU_PART_CORTEX_A72 0xD08
# define ARM_CPU_PART_N1 0xD0C

Expand Down
8 changes: 8 additions & 0 deletions crypto/armcap.c
Expand Up @@ -19,6 +19,7 @@

/* Capability bitmask; starts empty and is tested against ARMV8_* flags. */
unsigned int OPENSSL_armcap_P = 0;
/* CPU identification value; 0 until probed via _armv8_cpuid_probe(). */
unsigned int OPENSSL_arm_midr = 0;
/*
 * ChaCha20 code-path selector: 0 keeps the default path, 1 (set for
 * Cortex-A53/A55/A57/A72) selects the 4xNEON+1xIALU path.
 */
unsigned int OPENSSL_arm_chacha_choose = 0;

#if __ARM_MAX_ARCH__<7
void OPENSSL_cpuid_setup(void)
Expand Down Expand Up @@ -220,6 +221,13 @@ void OPENSSL_cpuid_setup(void)
# ifdef __aarch64__
if (OPENSSL_armcap_P & ARMV8_CPUID)
OPENSSL_arm_midr = _armv8_cpuid_probe();

if (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
|| MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55)
|| MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
|| MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)) {
OPENSSL_arm_chacha_choose = 1;
}
# endif
}
#endif
19 changes: 19 additions & 0 deletions crypto/chacha/asm/chacha-armv8.pl
Expand Up @@ -43,6 +43,15 @@
# ThunderX2 7.22/+48% 5.64 4.10
#
# (*) slower than 4+1:-(
#
#
# September 2020
#
# Use MIDR_EL1 system register to determine cpu core at runtime, and
# code path is chosen accordingly.
#
# Microarchitectures using 4xNEON+1xIALU: Cortex-A53, Cortex-A55, Cortex-A57,
# and Cortex-A72. Other platforms still use 6xNEON+2xIALU.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
Expand Down Expand Up @@ -136,6 +145,8 @@ sub ROUND {
# include "arm_arch.h"
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
.extern OPENSSL_arm_chacha_choose
.hidden OPENSSL_arm_chacha_choose
#endif
.text
Expand Down Expand Up @@ -443,9 +454,17 @@ sub NEON_lane_ROUND {
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
#ifndef __KERNEL__
adrp x17,OPENSSL_arm_chacha_choose
ldr w17,[x17,#:lo12:OPENSSL_arm_chacha_choose]
cbnz w17,.Loop_320_neon
#endif
cmp $len,#512
b.hs .L512_or_more_neon
.Loop_320_neon:
sub sp,sp,#64
ldp @d[0],@d[1],[@x[0]] // load sigma
Expand Down

0 comments on commit c89c00d

Please sign in to comment.