From f295cca8a6f548bddf7760b458855abb38535c5e Mon Sep 17 00:00:00 2001 From: Cameron Cawley Date: Thu, 27 Jul 2023 21:07:29 +0100 Subject: [PATCH] Use GCC's may_alias attribute for unaligned memory access --- arch/arm/chunkset_neon.c | 13 ++--- arch/arm/compare256_neon.c | 2 +- arch/generic/Makefile.in | 4 +- arch/generic/chunkset_c.c | 12 ++--- arch/generic/compare256_c.c | 10 ++-- arch/power/chunkset_power8.c | 13 ++--- arch/power/compare256_power9.c | 2 +- arch/riscv/compare256_rvv.c | 2 +- arch/x86/chunkset_avx2.c | 13 ++--- arch/x86/chunkset_sse2.c | 13 ++--- arch/x86/chunkset_ssse3.c | 13 ++--- arch/x86/compare256_avx2.c | 2 +- arch/x86/compare256_sse2.c | 2 +- compare256_rle.h | 25 ++++----- deflate.h | 11 ++-- deflate_quick.c | 2 +- inflate_p.h | 9 ++-- insert_string_tpl.h | 4 +- match_tpl.h | 35 ++++++------ win32/Makefile.a64 | 4 +- win32/Makefile.arm | 4 +- win32/Makefile.msc | 8 +-- zmemory.h | 99 ++++++++++++++++++++++++++++++++++ zutil_p.h | 29 ---------- 24 files changed, 192 insertions(+), 139 deletions(-) create mode 100644 zmemory.h diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 1c49ef5612..5daedd7ea7 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -5,6 +5,7 @@ #ifdef ARM_NEON #include "neon_intrins.h" #include "zbuild.h" +#include "zmemory.h" #include "arch/generic/chunk_permute_table.h" typedef uint8x16_t chunk_t; @@ -33,21 +34,15 @@ static const lut_rem_pair perm_idx_lut[13] = { }; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - uint16_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp)); + *chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from))); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - uint32_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp)); + *chunk = vreinterpretq_u8_u32(vdupq_n_u32(zng_memread_4(from))); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - uint64_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); + *chunk = vreinterpretq_u8_u64(vdupq_n_u64(zng_memread_8(from))); } #define CHUNKSIZE chunksize_neon diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c index 87d14c89c0..3d05152f34 100644 --- a/arch/arm/compare256_neon.c +++ b/arch/arm/compare256_neon.c @@ -4,7 +4,7 @@ */ #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in index 32c8242d02..15d51d3135 100644 --- a/arch/generic/Makefile.in +++ b/arch/generic/Makefile.in @@ -40,10 +40,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl. chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c -compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h +compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c -compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h +compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h diff --git a/arch/generic/chunkset_c.c b/arch/generic/chunkset_c.c index 7b2bb7ba36..0a585e6caa 100644 --- a/arch/generic/chunkset_c.c +++ b/arch/generic/chunkset_c.c @@ -3,6 +3,7 @@ */ #include "zbuild.h" +#include "zmemory.h" typedef uint64_t chunk_t; @@ -12,21 +13,20 @@ typedef uint64_t chunk_t; #define HAVE_CHUNKMEMSET_8 static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - uint8_t *dest = (uint8_t *)chunk; - memcpy(dest, from, sizeof(uint32_t)); - memcpy(dest+4, from, sizeof(uint32_t)); + uint32_t tmp = zng_memread_4(from); + *chunk = tmp | ((chunk_t)tmp << 32); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - memcpy(chunk, from, sizeof(uint64_t)); + *chunk = zng_memread_8(from); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { - memcpy(chunk, (uint8_t *)s, sizeof(uint64_t)); + *chunk = zng_memread_8(s); } static inline void storechunk(uint8_t *out, chunk_t *chunk) { - memcpy(out, chunk, sizeof(uint64_t)); + zng_memwrite_8(out, *chunk); } #define CHUNKSIZE chunksize_c diff --git a/arch/generic/compare256_c.c b/arch/generic/compare256_c.c index 0c12cb3a4e..f64542118b 100644 --- a/arch/generic/compare256_c.c +++ b/arch/generic/compare256_c.c @@ -4,7 +4,7 @@ */ #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" @@ -106,8 +106,8 @@ static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const do { uint32_t sv, mv, diff; - memcpy(&sv, src0, sizeof(sv)); - memcpy(&mv, src1, sizeof(mv)); + sv = zng_memread_4(src0); + mv = zng_memread_4(src1); diff = sv ^ mv; if (diff) { @@ -146,8 +146,8 @@ static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const do { uint64_t sv, mv, diff; - memcpy(&sv, src0, sizeof(sv)); - memcpy(&mv, src1, sizeof(mv)); + sv = zng_memread_8(src0); + mv = zng_memread_8(src1); diff = sv ^ mv; if (diff) { diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c index aef1973273..673fe0e112 100644 --- a/arch/power/chunkset_power8.c +++ b/arch/power/chunkset_power8.c @@ -5,6 +5,7 @@ #ifdef POWER8_VSX #include #include "zbuild.h" +#include "zmemory.h" typedef vector unsigned char chunk_t; @@ -15,21 +16,15 @@ typedef vector unsigned char chunk_t; #define HAVE_CHUNKMEMSET_8 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - uint16_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = (vector unsigned char)vec_splats(tmp); + *chunk = (vector unsigned char)vec_splats(zng_memread_2(from)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - uint32_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = (vector unsigned char)vec_splats(tmp); + *chunk = (vector unsigned char)vec_splats(zng_memread_4(from)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - uint64_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = (vector unsigned char)vec_splats((unsigned long long)tmp); + *chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from)); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { diff --git a/arch/power/compare256_power9.c b/arch/power/compare256_power9.c index c8be498e4f..2875719c47 100644 --- a/arch/power/compare256_power9.c +++ b/arch/power/compare256_power9.c @@ -6,7 +6,7 @@ #ifdef POWER9 #include #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "zendian.h" diff --git a/arch/riscv/compare256_rvv.c b/arch/riscv/compare256_rvv.c index 3d6c3e3aa5..3ddb4db080 100644 --- a/arch/riscv/compare256_rvv.c +++ b/arch/riscv/compare256_rvv.c @@ -7,7 +7,7 @@ #ifdef RISCV_RVV #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c index 70620b9154..96c1176ee6 100644 --- a/arch/x86/chunkset_avx2.c +++ b/arch/x86/chunkset_avx2.c @@ -2,6 +2,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ #include "zbuild.h" +#include "zmemory.h" #ifdef X86_AVX2 #include @@ -51,21 +52,15 @@ static const lut_rem_pair perm_idx_lut[29] = { }; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - int16_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm256_set1_epi16(tmp); + *chunk = _mm256_set1_epi16(zng_memread_2(from)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - int32_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm256_set1_epi32(tmp); + *chunk = _mm256_set1_epi32(zng_memread_4(from)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - int64_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm256_set1_epi64x(tmp); + *chunk = _mm256_set1_epi64x(zng_memread_8(from)); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { diff --git a/arch/x86/chunkset_sse2.c b/arch/x86/chunkset_sse2.c index c402c0ee18..ef09993ca6 100644 --- a/arch/x86/chunkset_sse2.c +++ b/arch/x86/chunkset_sse2.c @@ -3,6 +3,7 @@ */ #include "zbuild.h" +#include "zmemory.h" #ifdef X86_SSE2 #include @@ -16,21 +17,15 @@ typedef __m128i chunk_t; #define HAVE_CHUNKMEMSET_8 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - int16_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm_set1_epi16(tmp); + *chunk = _mm_set1_epi16(zng_memread_2(from)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - int32_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm_set1_epi32(tmp); + *chunk = _mm_set1_epi32(zng_memread_4(from)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - int64_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm_set1_epi64x(tmp); + *chunk = _mm_set1_epi64x(zng_memread_8(from)); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { diff --git a/arch/x86/chunkset_ssse3.c b/arch/x86/chunkset_ssse3.c index 722ecd3d51..cddf525392 100644 --- a/arch/x86/chunkset_ssse3.c +++ b/arch/x86/chunkset_ssse3.c @@ -3,6 +3,7 @@ */ #include "zbuild.h" +#include "zmemory.h" #if defined(X86_SSSE3) #include @@ -35,21 +36,15 @@ static const lut_rem_pair perm_idx_lut[13] = { static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - int16_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm_set1_epi16(tmp); + *chunk = _mm_set1_epi16(zng_memread_2(from)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - int32_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm_set1_epi32(tmp); + *chunk = _mm_set1_epi32(zng_memread_4(from)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - int64_t tmp; - memcpy(&tmp, from, sizeof(tmp)); - *chunk = _mm_set1_epi64x(tmp); + *chunk = _mm_set1_epi64x(zng_memread_8(from)); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c index d2c835e4ee..8a0213c3a6 100644 --- a/arch/x86/compare256_avx2.c +++ b/arch/x86/compare256_avx2.c @@ -4,7 +4,7 @@ */ #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c index 216bb3a705..25b65316a8 100644 --- a/arch/x86/compare256_sse2.c +++ b/arch/x86/compare256_sse2.c @@ -4,7 +4,7 @@ */ #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" diff --git a/compare256_rle.h b/compare256_rle.h index 0f3998d4a3..7b272bc7d2 100644 --- a/compare256_rle.h +++ b/compare256_rle.h @@ -4,6 +4,7 @@ */ #include "zbuild.h" +#include "zmemory.h" #include "fallback_builtins.h" typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1); @@ -46,25 +47,21 @@ static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1 /* 16-bit unaligned integer comparison */ static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) { uint32_t len = 0; - uint16_t src0_cmp, src1_cmp; + uint16_t src0_cmp; - memcpy(&src0_cmp, src0, sizeof(src0_cmp)); + src0_cmp = zng_memread_2(src0); do { - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - if (src0_cmp != src1_cmp) + if (src0_cmp != zng_memread_2(src1)) return len + (*src0 == *src1); src1 += 2, len += 2; - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - if (src0_cmp != src1_cmp) + if (src0_cmp != zng_memread_2(src1)) return len + (*src0 == *src1); src1 += 2, len += 2; - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - if (src0_cmp != src1_cmp) + if (src0_cmp != zng_memread_2(src1)) return len + (*src0 == *src1); src1 += 2, len += 2; - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - if (src0_cmp != src1_cmp) + if (src0_cmp != zng_memread_2(src1)) return len + (*src0 == *src1); src1 += 2, len += 2; } while (len < 256); @@ -78,13 +75,13 @@ static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const ui uint32_t sv, len = 0; uint16_t src0_cmp; - memcpy(&src0_cmp, src0, sizeof(src0_cmp)); + src0_cmp = zng_memread_2(src0); sv = ((uint32_t)src0_cmp << 16) | src0_cmp; do { uint32_t mv, diff; - memcpy(&mv, src1, sizeof(mv)); + mv = zng_memread_4(src1); diff = sv ^ mv; if (diff) { @@ -107,14 +104,14 @@ static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const ui uint16_t src0_cmp; uint64_t sv; - memcpy(&src0_cmp, src0, sizeof(src0_cmp)); + src0_cmp = zng_memread_2(src0); src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp; sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32; do { uint64_t mv, diff; - memcpy(&mv, src1, sizeof(mv)); + mv = zng_memread_8(src1); diff = sv ^ mv; if (diff) { diff --git a/deflate.h b/deflate.h index a6492031e1..c80fc150dc 100644 --- a/deflate.h +++ b/deflate.h @@ -12,6 +12,7 @@ #include "zutil.h" #include "zendian.h" +#include "zmemory.h" #include "crc32.h" /* define NO_GZIP when compiling if you want to disable gzip header and @@ -329,7 +330,7 @@ static inline void put_short(deflate_state *s, uint16_t w) { #if BYTE_ORDER == BIG_ENDIAN w = ZSWAP16(w); #endif - memcpy(&s->pending_buf[s->pending], &w, sizeof(w)); + zng_memwrite_2(&s->pending_buf[s->pending], w); s->pending += 2; } @@ -341,7 +342,7 @@ static inline void put_short_msb(deflate_state *s, uint16_t w) { #if BYTE_ORDER == LITTLE_ENDIAN w = ZSWAP16(w); #endif - memcpy(&s->pending_buf[s->pending], &w, sizeof(w)); + zng_memwrite_2(&s->pending_buf[s->pending], w); s->pending += 2; } @@ -353,7 +354,7 @@ static inline void put_uint32(deflate_state *s, uint32_t dw) { #if BYTE_ORDER == BIG_ENDIAN dw = ZSWAP32(dw); #endif - memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw)); + zng_memwrite_4(&s->pending_buf[s->pending], dw); s->pending += 4; } @@ -365,7 +366,7 @@ static inline void put_uint32_msb(deflate_state *s, uint32_t dw) { #if BYTE_ORDER == LITTLE_ENDIAN dw = ZSWAP32(dw); #endif - memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw)); + zng_memwrite_4(&s->pending_buf[s->pending], dw); s->pending += 4; } @@ -377,7 +378,7 @@ static inline void put_uint64(deflate_state *s, uint64_t lld) { #if BYTE_ORDER == BIG_ENDIAN lld = ZSWAP64(lld); #endif - memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld)); + zng_memwrite_8(&s->pending_buf[s->pending], lld); s->pending += 8; } diff --git a/deflate_quick.c b/deflate_quick.c index b72bd12900..a585e7b22d 100644 --- a/deflate_quick.c +++ b/deflate_quick.c @@ -18,7 +18,7 @@ */ #include "zbuild.h" -#include "zutil_p.h" +#include "zmemory.h" #include "deflate.h" #include "deflate_p.h" #include "functable.h" diff --git a/inflate_p.h b/inflate_p.h index eff73876da..2463d4d897 100644 --- a/inflate_p.h +++ b/inflate_p.h @@ -6,6 +6,7 @@ #define INFLATE_P_H #include +#include "zmemory.h" /* Architecture-specific hooks. */ #ifdef S390_DFLTCC_INFLATE @@ -137,8 +138,7 @@ /* Load 64 bits from IN and place the bytes at offset BITS in the result. */ static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) { - uint64_t chunk; - memcpy(&chunk, in, sizeof(chunk)); + uint64_t chunk = zng_memread_8(in); #if BYTE_ORDER == LITTLE_ENDIAN return chunk << bits; @@ -179,7 +179,10 @@ static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, len -= non_olap_size; /* So this doesn't give use a worst case scenario of function calls in a loop, - * we want to instead break this down into copy blocks of fixed lengths */ + * we want to instead break this down into copy blocks of fixed lengths + * + * TODO: The memcpy calls aren't inlined on architectures with strict memory alignment + */ while (len) { tocopy = MIN(non_olap_size, len); len -= tocopy; diff --git a/insert_string_tpl.h b/insert_string_tpl.h index 281c013463..989f6d53aa 100644 --- a/insert_string_tpl.h +++ b/insert_string_tpl.h @@ -22,6 +22,8 @@ * */ +#include "zmemory.h" + #ifndef HASH_CALC_OFFSET # define HASH_CALC_OFFSET 0 #endif @@ -31,7 +33,7 @@ #ifndef HASH_CALC_READ # if BYTE_ORDER == LITTLE_ENDIAN # define HASH_CALC_READ \ - memcpy(&val, strstart, sizeof(val)); + val = zng_memread_4(strstart); # else # define HASH_CALC_READ \ val = ((uint32_t)(strstart[0])); \ diff --git a/match_tpl.h b/match_tpl.h index 9c258242cd..81db70d53d 100644 --- a/match_tpl.h +++ b/match_tpl.h @@ -40,10 +40,15 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { uint32_t chain_length, nice_match, best_len, offset; uint32_t lookahead = s->lookahead; Pos match_offset = 0; -#ifdef UNALIGNED_OK - uint8_t scan_start[8]; -#endif +#ifdef UNALIGNED64_OK + uint64_t scan_start; + uint64_t scan_end; +#elif defined(UNALIGNED_OK) + uint32_t scan_start; + uint32_t scan_end; +#else uint8_t scan_end[8]; +#endif #define GOTO_NEXT_CHAIN \ if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \ @@ -70,11 +75,11 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #endif #ifdef UNALIGNED64_OK - memcpy(scan_start, scan, sizeof(uint64_t)); - memcpy(scan_end, scan+offset, sizeof(uint64_t)); + scan_start = zng_memread_8(scan); + scan_end = zng_memread_8(scan+offset); #elif defined(UNALIGNED_OK) - memcpy(scan_start, scan, sizeof(uint32_t)); - memcpy(scan_end, scan+offset, sizeof(uint32_t)); + scan_start = zng_memread_4(scan); + scan_end = zng_memread_4(scan+offset); #else scan_end[0] = *(scan+offset); scan_end[1] = *(scan+offset+1); @@ -141,24 +146,24 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #ifdef UNALIGNED_OK if (best_len < sizeof(uint32_t)) { for (;;) { - if (zng_memcmp_2(mbase_end+cur_match, scan_end) == 0 && - zng_memcmp_2(mbase_start+cur_match, scan_start) == 0) + if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && + zng_memcmp_2(mbase_start+cur_match, &scan_start) == 0) break; GOTO_NEXT_CHAIN; } # ifdef UNALIGNED64_OK } else if (best_len >= sizeof(uint64_t)) { for (;;) { - if (zng_memcmp_8(mbase_end+cur_match, scan_end) == 0 && - zng_memcmp_8(mbase_start+cur_match, scan_start) == 0) + if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && + zng_memcmp_8(mbase_start+cur_match, &scan_start) == 0) break; GOTO_NEXT_CHAIN; } # endif } else { for (;;) { - if (zng_memcmp_4(mbase_end+cur_match, scan_end) == 0 && - zng_memcmp_4(mbase_start+cur_match, scan_start) == 0) + if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && + zng_memcmp_4(mbase_start+cur_match, &scan_start) == 0) break; GOTO_NEXT_CHAIN; } @@ -197,9 +202,9 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #endif #ifdef UNALIGNED64_OK - memcpy(scan_end, scan+offset, sizeof(uint64_t)); + scan_end = zng_memread_8(scan+offset); #elif defined(UNALIGNED_OK) - memcpy(scan_end, scan+offset, sizeof(uint32_t)); + scan_end = zng_memread_4(scan+offset); #else scan_end[0] = *(scan+offset); scan_end[1] = *(scan+offset+1); diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index 9f8d6fb7fa..3209f6a305 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -183,7 +183,7 @@ adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_ adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h -compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zmemory.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h @@ -194,7 +194,7 @@ deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p. deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h -deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zmemory.h deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h diff --git a/win32/Makefile.arm b/win32/Makefile.arm index cab999dfe0..54da045ffd 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -204,7 +204,7 @@ adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_ adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h -compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zmemory.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h @@ -215,7 +215,7 @@ deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p. deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h -deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zmemory.h deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 8392fe46e7..62ca621aef 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -212,9 +212,9 @@ chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset chunkset_avx2.obj: $(TOP)/arch/x86/chunkset_avx2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h chunkset_sse2.obj: $(TOP)/arch/x86/chunkset_sse2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h chunkset_ssse3.obj: $(TOP)/arch/x86/chunkset_ssse3.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h -compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h -compare256_avx2.obj: $(TOP)/arch/x86/compare256_avx2.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h -compare256_sse2.obj: $(TOP)/arch/x86/compare256_sse2.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zmemory.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_avx2.obj: $(TOP)/arch/x86/compare256_avx2.c $(TOP)/zbuild.h $(TOP)/zmemory.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_sse2.obj: $(TOP)/arch/x86/compare256_sse2.c $(TOP)/zbuild.h $(TOP)/zmemory.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h @@ -226,7 +226,7 @@ deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p. deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h -deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zmemory.h deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h diff --git a/zmemory.h b/zmemory.h new file mode 100644 index 0000000000..996aec6073 --- /dev/null +++ b/zmemory.h @@ -0,0 +1,99 @@ +/* zmemory.h -- Private inline functions used internally in zlib-ng + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef _ZMEMORY_H +#define _ZMEMORY_H + +#if defined(__GNUC__) && (__GNUC__ >= 4) +# define HAVE_MAY_ALIAS +#endif + +static inline uint16_t zng_memread_2(const void *ptr) { +#if defined(HAVE_MAY_ALIAS) + typedef struct { uint16_t val; } __attribute__ ((__packed__, __may_alias__)) unaligned_uint16_t; + return ((const unaligned_uint16_t *)ptr)->val; +#else + uint16_t val; + memcpy(&val, ptr, sizeof(val)); + return val; +#endif +} + +static inline uint32_t zng_memread_4(const void *ptr) { +#if defined(HAVE_MAY_ALIAS) + typedef struct { uint32_t val; } __attribute__ ((__packed__, __may_alias__)) unaligned_uint32_t; + return ((const unaligned_uint32_t *)ptr)->val; +#else + uint32_t val; + memcpy(&val, ptr, sizeof(val)); + return val; +#endif +} + +static inline uint64_t zng_memread_8(const void *ptr) { +#if defined(HAVE_MAY_ALIAS) + typedef struct { uint64_t val; } __attribute__ ((__packed__, __may_alias__)) unaligned_uint64_t; + return ((const unaligned_uint64_t *)ptr)->val; +#else + uint64_t val; + memcpy(&val, ptr, sizeof(val)); + return val; +#endif +} + +static inline void zng_memwrite_2(void *ptr, uint16_t val) { +#if defined(HAVE_MAY_ALIAS) + typedef struct { uint16_t val; } __attribute__ ((__packed__, __may_alias__)) unaligned_uint16_t; + ((unaligned_uint16_t *)ptr)->val = val; +#else + memcpy(ptr, &val, sizeof(val)); +#endif +} + +static inline void zng_memwrite_4(void *ptr, uint32_t val) { +#if defined(HAVE_MAY_ALIAS) + typedef struct { uint32_t val; } __attribute__ ((__packed__, __may_alias__)) unaligned_uint32_t; + ((unaligned_uint32_t *)ptr)->val = val; +#else + memcpy(ptr, &val, sizeof(val)); +#endif +} + +static inline void zng_memwrite_8(void *ptr, uint64_t val) { +#if defined(HAVE_MAY_ALIAS) + typedef struct { uint64_t val; } __attribute__ ((__packed__, __may_alias__)) unaligned_uint64_t; + ((unaligned_uint64_t *)ptr)->val = val; +#else + memcpy(ptr, &val, sizeof(val)); +#endif +} + +/* Use zng_memread_* instead of memcmp to avoid older compilers not converting memcmp + calls to unaligned comparisons when unaligned access is supported. Use memcmp only when + unaligned support is not available to avoid an extra call to memcpy. */ +static inline int32_t zng_memcmp_2(const void *src0, const void *src1) { +#if defined(HAVE_MAY_ALIAS) || defined(UNALIGNED_OK) + return zng_memread_2(src0) != zng_memread_2(src1); +#else + return memcmp(src0, src1, 2); +#endif +} + +static inline int32_t zng_memcmp_4(const void *src0, const void *src1) { +#if defined(HAVE_MAY_ALIAS) || defined(UNALIGNED_OK) + return zng_memread_4(src0) != zng_memread_4(src1); +#else + return memcmp(src0, src1, 4); +#endif +} + +static inline int32_t zng_memcmp_8(const void *src0, const void *src1) { +#if defined(HAVE_MAY_ALIAS) || defined(UNALIGNED64_OK) + return zng_memread_8(src0) != zng_memread_8(src1); +#else + return memcmp(src0, src1, 8); +#endif +} + +#endif diff --git a/zutil_p.h b/zutil_p.h index 97799f0ce3..835e12f4de 100644 --- a/zutil_p.h +++ b/zutil_p.h @@ -43,33 +43,4 @@ static inline void zng_free(void *ptr) { #endif } -/* Use memcpy instead of memcmp to avoid older compilers not converting memcmp calls to - unaligned comparisons when unaligned access is supported. */ -static inline int32_t zng_memcmp_2(const void *src0, const void *src1) { - uint16_t src0_cmp, src1_cmp; - - memcpy(&src0_cmp, src0, sizeof(src0_cmp)); - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - - return src0_cmp != src1_cmp; -} - -static inline int32_t zng_memcmp_4(const void *src0, const void *src1) { - uint32_t src0_cmp, src1_cmp; - - memcpy(&src0_cmp, src0, sizeof(src0_cmp)); - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - - return src0_cmp != src1_cmp; -} - -static inline int32_t zng_memcmp_8(const void *src0, const void *src1) { - uint64_t src0_cmp, src1_cmp; - - memcpy(&src0_cmp, src0, sizeof(src0_cmp)); - memcpy(&src1_cmp, src1, sizeof(src1_cmp)); - - return src0_cmp != src1_cmp; -} - #endif