From 9e31ad4fbc4f594a8e302d7c29f8a6e33362f3de Mon Sep 17 00:00:00 2001 From: Nathan Moinvaziri Date: Sun, 9 Jan 2022 17:27:20 -0800 Subject: [PATCH] Move unaligned access detection into zutil.h Provide zmemcmp and zmemcpy. --- CMakeLists.txt | 42 -------------------------------- README.md | 1 - arch/x86/compare258_avx.c | 4 +-- arch/x86/compare258_sse.c | 4 +-- arch/x86/x86.c | 8 +++--- compare258.c | 20 +++++++-------- configure | 16 ------------ functable.c | 1 + insert_string_tpl.h | 4 +-- match_tpl.h | 37 ++++++++++++++++++---------- win32/Makefile.a64 | 2 -- win32/Makefile.arm | 1 - win32/Makefile.msc | 2 -- zutil.h | 51 ++++++++++++++++++++++++++++++++------- 14 files changed, 87 insertions(+), 106 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ecb9ac6b27..e64c708baf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,46 +251,6 @@ if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") endif() -# Set architecture alignment requirements -if(WITH_UNALIGNED) - if((BASEARCH_ARM_FOUND AND NOT "${ARCH}" MATCHES "armv[2-7]") OR (BASEARCH_PPC_FOUND AND "${ARCH}" MATCHES "powerpc64le") OR BASEARCH_X86_FOUND) - if(NOT DEFINED UNALIGNED_OK) - set(UNALIGNED_OK TRUE) - endif() - endif() - if(UNALIGNED_OK) - add_definitions(-DUNALIGNED_OK) - message(STATUS "Architecture supports unaligned reads") - endif() - if(BASEARCH_ARM_FOUND) - if(NOT DEFINED UNALIGNED64_OK) - if("${ARCH}" MATCHES "armv[2-7]") - set(UNALIGNED64_OK FALSE) - elseif("${ARCH}" MATCHES "(arm(v[8-9])?|aarch64)") - set(UNALIGNED64_OK TRUE) - endif() - endif() - endif() - if(BASEARCH_PPC_FOUND) - if(NOT DEFINED UNALIGNED64_OK) - if("${ARCH}" MATCHES "powerpc64le") - set(UNALIGNED64_OK TRUE) - endif() - endif() - endif() - if(BASEARCH_X86_FOUND) - if(NOT DEFINED UNALIGNED64_OK) - set(UNALIGNED64_OK TRUE) - endif() - endif() - if(UNALIGNED64_OK) - add_definitions(-DUNALIGNED64_OK) - message(STATUS "Architecture supports unaligned reads of 
> 4 bytes") - endif() -else() - message(STATUS "Unaligned reads manually disabled") -endif() - # Apply warning compiler flags if(WITH_MAINTAINER_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WARNFLAGS} ${WARNFLAGS_MAINTAINER} ${WARNFLAGS_DISABLE}") @@ -1454,8 +1414,6 @@ add_feature_info(WITH_MAINTAINER_WARNINGS WITH_MAINTAINER_WARNINGS "Build with p add_feature_info(WITH_CODE_COVERAGE WITH_CODE_COVERAGE "Enable code coverage reporting") add_feature_info(WITH_INFLATE_STRICT WITH_INFLATE_STRICT "Build with strict inflate distance checking") add_feature_info(WITH_INFLATE_ALLOW_INVALID_DIST WITH_INFLATE_ALLOW_INVALID_DIST "Build with zero fill for inflate invalid distances") -add_feature_info(WITH_UNALIGNED UNALIGNED_OK "Support unaligned reads on platforms that support it") -add_feature_info(WITH_UNALIGNED64 UNALIGNED64_OK "Support unaligned 64-bit reads on platforms that support it") if(BASEARCH_ARM_FOUND) add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE") diff --git a/README.md b/README.md index 79d6648bdb..50fb7a7492 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,6 @@ Advanced Build Options | CMake | configure | Description | Default | |:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------| | ZLIB_DUAL_LINK | | Dual link tests with system zlib | OFF | -| UNALIGNED_OK | | Allow unaligned reads | ON (x86, arm) | | | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) | | WITH_AVX2 | | Build with AVX2 intrinsics | ON | | WITH_AVX512 | | Build with AVX512 intrinsics | ON | diff --git a/arch/x86/compare258_avx.c b/arch/x86/compare258_avx.c index 4fafc4c6d6..e5715c854a 100644 --- a/arch/x86/compare258_avx.c +++ b/arch/x86/compare258_avx.c @@ -15,7 +15,7 @@ # include #endif -/* UNALIGNED_OK, AVX2 intrinsic comparison */ +/* AVX2 unaligned intrinsic comparison */ static inline uint32_t compare256_unaligned_avx2_static(const 
unsigned char *src0, const unsigned char *src1) { uint32_t len = 0; @@ -48,7 +48,7 @@ static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src } static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return (*src0 == *src1); return compare256_unaligned_avx2_static(src0+2, src1+2) + 2; diff --git a/arch/x86/compare258_sse.c b/arch/x86/compare258_sse.c index f0accfd831..8c2148d2c1 100644 --- a/arch/x86/compare258_sse.c +++ b/arch/x86/compare258_sse.c @@ -25,7 +25,7 @@ # include #endif -/* UNALIGNED_OK, SSE4.2 intrinsic comparison */ +/* SSE4.2 unaligned intrinsic comparison */ static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { uint32_t len = 0; @@ -55,7 +55,7 @@ static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src } static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return (*src0 == *src1); return compare256_unaligned_sse4_static(src0+2, src1+2) + 2; diff --git a/arch/x86/x86.c b/arch/x86/x86.c index 32baf8a749..253caf91cb 100644 --- a/arch/x86/x86.c +++ b/arch/x86/x86.c @@ -70,9 +70,9 @@ void Z_INTERNAL x86_check_features(void) { /* NULL terminate the string */ memset(cpu_identity, 0, 13); - memcpy(cpu_identity, (char*)&ebx, sizeof(int)); - memcpy(cpu_identity + 4, (char*)&edx, sizeof(int)); - memcpy(cpu_identity + 8, (char*)&ecx, sizeof(int)); + zmemcpy_4(cpu_identity, (char *)&ebx); + zmemcpy_4(cpu_identity + 4, (char *)&edx); + zmemcpy_4(cpu_identity + 8, (char *)&ecx); intel_cpu = strncmp(cpu_identity, "GenuineIntel", 12) == 0; @@ -119,7 +119,7 @@ void Z_INTERNAL x86_check_features(void) { x86_cpu_well_suited_avx512 = 1; } else if (model == 0xa && extended_model == 0x6) { /*
Icelake server */ - x86_cpu_well_suited_avx512 = 1; + x86_cpu_well_suited_avx512 = 1; } else if (model == 0xf && extended_model == 0x8) { /* Saphire rapids */ x86_cpu_well_suited_avx512 = 1; diff --git a/compare258.c b/compare258.c index 6bb5c90615..df825cd830 100644 --- a/compare258.c +++ b/compare258.c @@ -69,21 +69,21 @@ Z_INTERNAL uint32_t compare258_c(const unsigned char *src0, const unsigned char #include "match_tpl.h" #ifdef UNALIGNED_OK -/* UNALIGNED_OK, 16-bit integer comparison */ +/* 16-bit unaligned integer comparison */ static inline uint32_t compare256_unaligned_16_static(const unsigned char *src0, const unsigned char *src1) { uint32_t len = 0; do { - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return len + (*src0 == *src1); src0 += 2, src1 += 2, len += 2; - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return len + (*src0 == *src1); src0 += 2, src1 += 2, len += 2; - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return len + (*src0 == *src1); src0 += 2, src1 += 2, len += 2; - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return len + (*src0 == *src1); src0 += 2, src1 += 2, len += 2; } while (len < 256); @@ -92,7 +92,7 @@ static inline uint32_t compare256_unaligned_16_static(const unsigned char *src0, } static inline uint32_t compare258_unaligned_16_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return (*src0 == *src1); return compare256_unaligned_16_static(src0+2, src1+2) + 2; @@ -114,7 +114,7 @@ Z_INTERNAL uint32_t compare258_unaligned_16(const unsigned char *src0, const uns #include "match_tpl.h" #ifdef HAVE_BUILTIN_CTZ -/* UNALIGNED_OK, 32-bit integer comparison */ +/* 32-bit unaligned integer comparison */ static inline uint32_t compare256_unaligned_32_static(const unsigned char *src0, const unsigned char *src1) { uint32_t
len = 0; @@ -137,7 +137,7 @@ static inline uint32_t compare256_unaligned_32_static(const unsigned char *src0, } static inline uint32_t compare258_unaligned_32_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return (*src0 == *src1); return compare256_unaligned_32_static(src0+2, src1+2) + 2; @@ -161,7 +161,7 @@ Z_INTERNAL uint32_t compare258_unaligned_32(const unsigned char *src0, const uns #endif #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) -/* UNALIGNED64_OK, 64-bit integer comparison */ +/* 64-bit unaligned integer comparison */ static inline uint32_t compare256_unaligned_64_static(const unsigned char *src0, const unsigned char *src1) { uint32_t len = 0; @@ -184,7 +184,7 @@ static inline uint32_t compare256_unaligned_64_static(const unsigned char *src0, } static inline uint32_t compare258_unaligned_64_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) + if (zmemcmp_2(src0, src1) != 0) return (*src0 == *src1); return compare256_unaligned_64_static(src0+2, src1+2) + 2; diff --git a/configure b/configure index 143bc6bf3f..666b949e64 100755 --- a/configure +++ b/configure @@ -1422,9 +1422,6 @@ case "${ARCH}" in i386 | i486 | i586 | i686 |x86_64) ARCHDIR=arch/x86 - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" - # Enable arch-specific optimizations if test $without_optimizations -eq 0; then CFLAGS="${CFLAGS} -DX86_FEATURES" @@ -1646,8 +1643,6 @@ EOF fi ;; armv6l | armv6hl) - CFLAGS="${CFLAGS} -DUNALIGNED_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK" if test $without_optimizations -eq 0; then if test $buildacle -eq 1; then @@ -1660,8 +1655,6 @@ EOF fi ;; arm | armv7*) - CFLAGS="${CFLAGS} -DUNALIGNED_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK" if test $without_optimizations -eq 0; then if test $buildacle -eq 1; then @@ -1682,8 +1675,6 @@ EOF fi ;; armv8-a |
armv8-a+simd) - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" if test $without_optimizations -eq 0; then if test $buildacle -eq 1; then @@ -1704,8 +1695,6 @@ EOF fi ;; armv8-a+crc | armv8-a+crc+simd | armv8.[1234]-a | armv8.[1234]-a+simd) - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" acleflag="-march=${ARCH}" @@ -1787,9 +1776,6 @@ EOF neonflag="-march=${ARCH}" acleflag="-march=${ARCH}" - - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" ;; powerpc*) case "${ARCH}" in @@ -1801,8 +1787,6 @@ EOF ;; powerpc64le) [ ! -z $CROSS_PREFIX ] && QEMU_ARCH=ppc64le - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNALIGNED64_OK" ;; esac diff --git a/functable.c b/functable.c index ccb0b69a30..1f6c988e40 100644 --- a/functable.c +++ b/functable.c @@ -4,6 +4,7 @@ */ #include "zbuild.h" +#include "zutil.h" #include "zendian.h" #include "crc32_p.h" #include "deflate.h" diff --git a/insert_string_tpl.h b/insert_string_tpl.h index 7d3e46c897..643a5e0e31 100644 --- a/insert_string_tpl.h +++ b/insert_string_tpl.h @@ -29,9 +29,9 @@ # define HASH_CALC_MASK HASH_MASK #endif #ifndef HASH_CALC_READ -# ifdef UNALIGNED_OK +# if BYTE_ORDER == LITTLE_ENDIAN # define HASH_CALC_READ \ - memcpy(&val, strstart, sizeof(val)); + zmemcpy_4(&val, strstart); # else # define HASH_CALC_READ \ val = ((uint32_t)(strstart[0])); \ diff --git a/match_tpl.h b/match_tpl.h index 75181bf851..83173e836f 100644 --- a/match_tpl.h +++ b/match_tpl.h @@ -85,11 +85,16 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { } #endif - memcpy(&scan_end, scan+offset, sizeof(bestcmp_t)); -#ifndef UNALIGNED_OK - memcpy(&scan_end0, scan+offset+1, sizeof(bestcmp_t)); +#ifdef UNALIGNED64_OK + zmemcpy_8(&scan_start, scan); + zmemcpy_8(&scan_end, scan+offset); +#elif defined(UNALIGNED_OK) + 
zmemcpy_4(&scan_start, scan); + zmemcpy_4(&scan_end, scan+offset); #else - memcpy(&scan_start, scan, sizeof(bestcmp_t)); + scan_start = *(scan); + scan_end = *(scan+offset); + scan_end0 = *(scan+offset+1); #endif mbase_end = (mbase_start+offset); @@ -153,24 +158,24 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #ifdef UNALIGNED_OK if (best_len < sizeof(uint32_t)) { for (;;) { - if (*(uint16_t *)(mbase_end+cur_match) == (uint16_t)scan_end && - *(uint16_t *)(mbase_start+cur_match) == (uint16_t)scan_start) + if (zmemcmp_2(mbase_end+cur_match, &scan_end) == 0 && + zmemcmp_2(mbase_start+cur_match, &scan_start) == 0) break; GOTO_NEXT_CHAIN; } # ifdef UNALIGNED64_OK } else if (best_len >= sizeof(uint64_t)) { for (;;) { - if (*(uint64_t *)(mbase_end+cur_match) == (uint64_t)scan_end && - *(uint64_t *)(mbase_start+cur_match) == (uint64_t)scan_start) + if (zmemcmp_8(mbase_end+cur_match, &scan_end) == 0 && + zmemcmp_8(mbase_start+cur_match, &scan_start) == 0) break; GOTO_NEXT_CHAIN; } # endif } else { for (;;) { - if (*(uint32_t *)(mbase_end+cur_match) == (uint32_t)scan_end && - *(uint32_t *)(mbase_start+cur_match) == (uint32_t)scan_start) + if (zmemcmp_4(mbase_end+cur_match, &scan_end) == 0 && + zmemcmp_4(mbase_start+cur_match, &scan_start) == 0) break; GOTO_NEXT_CHAIN; } @@ -207,10 +212,16 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #endif } #endif - memcpy(&scan_end, scan+offset, sizeof(bestcmp_t)); -#ifndef UNALIGNED_OK - memcpy(&scan_end0, scan+offset+1, sizeof(bestcmp_t)); + +#ifdef UNALIGNED64_OK + zmemcpy_8(&scan_end, scan+offset); +#elif defined(UNALIGNED_OK) + zmemcpy_4(&scan_end, scan+offset); +#else + scan_end = *(scan+offset); + scan_end0 = *(scan+offset+1); #endif + #ifdef LONGEST_MATCH_SLOW /* Look for a better string offset */ if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) { diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index 580b5bfb1d..01ab96067c 100644 ---
a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -27,8 +27,6 @@ CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) WFLAGS = \ -D_CRT_SECURE_NO_DEPRECATE \ -D_CRT_NONSTDC_NO_DEPRECATE \ - -DUNALIGNED_OK \ - -DUNALIGNED64_OK \ -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ -DARM_FEATURES \ # diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 12b08dc671..8f943b3708 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -27,7 +27,6 @@ CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) WFLAGS = \ -D_CRT_SECURE_NO_DEPRECATE \ -D_CRT_NONSTDC_NO_DEPRECATE \ - -DUNALIGNED_OK \ -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ -DARM_FEATURES \ # diff --git a/win32/Makefile.msc b/win32/Makefile.msc index ef3f237ffc..5420a463e7 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -35,8 +35,6 @@ WFLAGS = \ -DX86_AVX2 \ -DX86_AVX_CHUNKSET \ -DX86_SSE2_CHUNKSET \ - -DUNALIGNED_OK \ - -DUNALIGNED64_OK \ # LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo diff --git a/zutil.h b/zutil.h index f57a322203..3c06643c7a 100644 --- a/zutil.h +++ b/zutil.h @@ -259,19 +259,52 @@ void Z_INTERNAL zng_cfree(void *opaque, void *ptr); # define ALIGNED_(x) __declspec(align(x)) #endif -#ifdef FORCE_UNALIGNED -# define zmemcpy_2(dest, src) *((uint16_t *)dest) = *((uint16_t *)src) -# define zmemcpy_4(dest, src) *((uint32_t *)dest) = *((uint32_t *)src) +#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) || defined(_X86_) +# define UNALIGNED_OK +#elif defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64) +# define UNALIGNED_OK +# define UNALIGNED64_OK +#elif defined(__arm__) || (_M_ARM >= 6) +# if (defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED)) || !defined(__GNUC__) +# define UNALIGNED_OK +# endif +#elif defined(__aarch64__) || defined(_M_ARM64) +# if (defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED)) || !defined(__GNUC__) +# define UNALIGNED_OK +# 
define UNALIGNED64_OK +# endif +#elif defined(__powerpc64__) || defined(__ppc64__) +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define UNALIGNED_OK +# define UNALIGNED64_OK +# endif +#endif + +/* Force compiler to emit unaligned memory accesses if unaligned access is supported + on the architecture, otherwise don't assume unaligned access is supported. This + is for old compilers that don't optimize memcpy and memcmp calls to unaligned + access instructions when it is supported on the architecture. */ +#if defined(UNALIGNED_OK) +# define zmemcmp_2(str1, str2) *((uint16_t *)(str1)) != *((uint16_t *)(str2)) +# define zmemcpy_2(dest, src) *((uint16_t *)(dest)) = *((uint16_t *)(src)) +# define zmemcmp_4(str1, str2) *((uint32_t *)(str1)) != *((uint32_t *)(str2)) +# define zmemcpy_4(dest, src) *((uint32_t *)(dest)) = *((uint32_t *)(src)) # if UINTPTR_MAX == UINT64_MAX -# define zmemcpy_8(dest, src) *((uint64_t *)dest) = *((uint64_t *)src) +# define zmemcmp_8(str1, str2) *((uint64_t *)(str1)) != *((uint64_t *)(str2)) +# define zmemcpy_8(dest, src) *((uint64_t *)(dest)) = *((uint64_t *)(src)) # else -# define zmemcpy_8(dest, src) ((uint32_t *)dest)[0] = *((uint32_t *)src)[0] \ ((uint32_t *)dest)[1] = *((uint32_t *)src)[1] +# define zmemcmp_8(str1, str2) (((uint32_t *)(str1))[0] != ((uint32_t *)(str2))[0] || \ ((uint32_t *)(str1))[1] != ((uint32_t *)(str2))[1]) +# define zmemcpy_8(dest, src) (((uint32_t *)(dest))[0] = ((uint32_t *)(src))[0], \ ((uint32_t *)(dest))[1] = ((uint32_t *)(src))[1]) # endif #else -# define zmemcpy_2(dest, src) memcpy(dest, src, 2) -# define zmemcpy_4(dest, src) memcpy(dest, src, 4) -# define zmemcpy_8(dest, src) memcpy(dest, src, 8) +# define zmemcmp_2(str1, str2) memcmp(str1, str2, 2) +# define zmemcpy_2(dest, src) memcpy(dest, src, 2) +# define zmemcmp_4(str1, str2) memcmp(str1, str2, 4) +# define zmemcpy_4(dest, src) memcpy(dest, src, 4) +# define zmemcmp_8(str1, str2) memcmp(str1, str2, 8) +# define zmemcpy_8(dest, src) memcpy(dest, 
src, 8) #endif #if defined(X86_FEATURES)