Skip to content

Commit

Permalink
remove endian-specific logic from str*cmp (twitter#177)
Browse files Browse the repository at this point in the history
* remove endian-specific logic from str*cmp

* adding test, note

* an improved generic string comparison, preserving original little endian version as comment

* axed lookup3, remove endian config, bumped version due to incompatibility

* up version in rust binding
  • Loading branch information
thinkingfish committed Oct 26, 2018
1 parent 4c0668b commit a6a54d9
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 520 deletions.
8 changes: 2 additions & 6 deletions CMakeLists.txt
Expand Up @@ -36,8 +36,8 @@ endif()
# config.h.in has to include entries set/tested here for them to have effect

# version info
set(${PROJECT_NAME}_VERSION_MAJOR 1)
set(${PROJECT_NAME}_VERSION_MINOR 2)
set(${PROJECT_NAME}_VERSION_MAJOR 2)
set(${PROJECT_NAME}_VERSION_MINOR 0)
set(${PROJECT_NAME}_VERSION_PATCH 0)
set(${PROJECT_NAME}_VERSION
${${PROJECT_NAME}_VERSION_MAJOR}.${${PROJECT_NAME}_VERSION_MINOR}.${${PROJECT_NAME}_VERSION_PATCH}
Expand Down Expand Up @@ -98,9 +98,6 @@ include(CheckFunctionExists)
check_function_exists(backtrace HAVE_BACKTRACE)
check_function_exists(accept4 HAVE_ACCEPT4)

include(TestBigEndian)
test_big_endian(HAVE_BIG_ENDIAN)

# how to use config.h.in to generate config.h
# this has to be set _after_ the above checks
configure_file(
Expand Down Expand Up @@ -190,6 +187,5 @@ message(STATUS "CFLAGS: " ${CMAKE_C_FLAGS})
message(STATUS "HAVE_SIGNAME: " ${HAVE_SIGNAME})

message(STATUS "HAVE_BACKTRACE: " ${HAVE_BACKTRACE})
message(STATUS "HAVE_BIG_ENDIAN: " ${HAVE_BIG_ENDIAN})

message(STATUS "CHECK_WORKING: " ${CHECK_WORKING})
2 changes: 0 additions & 2 deletions config.h.in
Expand Up @@ -14,8 +14,6 @@

#cmakedefine HAVE_ACCEPT4

#cmakedefine HAVE_BIG_ENDIAN

#cmakedefine HAVE_LOGGING

#cmakedefine HAVE_STATS
Expand Down
56 changes: 32 additions & 24 deletions include/cc_bstring.h
Expand Up @@ -58,78 +58,86 @@ int bstring_compare(const struct bstring *s1, const struct bstring *s2);
struct bstring *bstring_alloc(uint32_t size);
void bstring_free(struct bstring **bstring);

/* TODO(yao): is this endian thing really useful? */
/* efficient implementation of string comparion of short strings */
#define str2cmp(m, c0, c1) \
(m[0] == c0 && m[1] == c1)

#define str3cmp(m, c0, c1, c2) \
(m[0] == c0 && m[1] == c1 && m[2] == c2)

#ifdef CC_LITTLE_ENDIAN

#define str4cmp(m, c0, c1, c2, c3) \
(*(uint32_t *) m == ((c3 << 24) | (c2 << 16) | (c1 << 8) | c0))
((m[0] << 24 | m[1] << 16 | m[2] << 8 | m[3]) == \
((c0 << 24) | (c1 << 16) | (c2 << 8) | c3))

#define str5cmp(m, c0, c1, c2, c3, c4) \
(str4cmp(m, c0, c1, c2, c3) && (m[4] == c4))

#define str6cmp(m, c0, c1, c2, c3, c4, c5) \
(str4cmp(m, c0, c1, c2, c3) && \
(((uint32_t *) m)[1] & 0xffff) == ((c5 << 8) | c4))
(str5cmp(m, c0, c1, c2, c3, c4) && m[5] == c5)

#define str7cmp(m, c0, c1, c2, c3, c4, c5, c6) \
(str6cmp(m, c0, c1, c2, c3, c4, c5) && (m[6] == c6))
(str6cmp(m, c0, c1, c2, c3, c4, c5) && m[6] == c6)

#define str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \
(str4cmp(m, c0, c1, c2, c3) && \
(((uint32_t *) m)[1] == ((c7 << 24) | (c6 << 16) | (c5 << 8) | c4)))
(m[4] << 24 | m[5] << 16 | m[6] << 8 | m[7]) == \
((c4 << 24) | (c5 << 16) | (c6 << 8) | c7))

#define str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \
(str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && m[8] == c8)

#define str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \
(str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \
(((uint32_t *) m)[2] & 0xffff) == ((c9 << 8) | c8))
(str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) && m[9] == c9)

#define str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \
(str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && (m[10] == c10))
(str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && m[10] == c10)

#define str12cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \
(str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \
(((uint32_t *) m)[2] == ((c11 << 24) | (c10 << 16) | (c9 << 8) | c8)))

#else // BIG ENDIAN
(m[8] << 24 | m[9] << 16 | m[10] << 8 | m[11]) == \
((c8 << 24) | (c9 << 16) | (c10 << 8) | c11))

/* below is a more efficient implementation for little-endian only, it takes
* about 50% the cycles compared to the generic implementation above in the
* extreme cases (e.g. string length being multiples of 4), however, our
* profiling showed that string comparison does not contribute meaningfully to
* overall processing cost, both events and hashes are far more notable, and
* therefore we can choose the generic implementation until profiling results
* indicate otherwise.
*/

/*
#define str4cmp(m, c0, c1, c2, c3) \
(str3cmp(m, c0, c1, c2) && (m3 == c3))
(*(uint32_t *) m == ((c3 << 24) | (c2 << 16) | (c1 << 8) | c0))
#define str5cmp(m, c0, c1, c2, c3, c4) \
(str4cmp(m, c0, c1, c2, c3) && (m[4] == c4))
#define str6cmp(m, c0, c1, c2, c3, c4, c5) \
(str5cmp(m, c0, c1, c2, c3, c4) && m[5] == c5)
(str4cmp(m, c0, c1, c2, c3) && \
(((uint32_t *) m)[1] & 0xffff) == ((c5 << 8) | c4))
#define str7cmp(m, c0, c1, c2, c3, c4, c5, c6) \
(str6cmp(m, c0, c1, c2, c3, c4, c5) && m[6] == c6)
(str6cmp(m, c0, c1, c2, c3, c4, c5) && (m[6] == c6))
#define str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \
(str7cmp(m, c0, c1, c2, c3, c4, c5, c6) && m[7] == c7)
(str4cmp(m, c0, c1, c2, c3) && \
(((uint32_t *) m)[1] == ((c7 << 24) | (c6 << 16) | (c5 << 8) | c4)))
#define str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \
(str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && m[8] == c8)
#define str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \
(str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) && m[9] == c9)
(str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \
(((uint32_t *) m)[2] & 0xffff) == ((c9 << 8) | c8))
#define str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \
(str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && m[10] == c10)
(str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && (m[10] == c10))
#define str12cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \
(str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) && m[11] == c11)

#endif // CC_LITTLE_ENDIAN

(str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \
(((uint32_t *) m)[2] == ((c11 << 24) | (c10 << 16) | (c9 << 8) | c8)))
*/

/*
* Wrapper around common routines for manipulating C character strings
Expand Down
7 changes: 0 additions & 7 deletions include/cc_define.h
Expand Up @@ -27,13 +27,6 @@ extern "C" {
# define CC_HAVE_SIGNAME 1
#endif


#ifdef HAVE_BIG_ENDIAN
# define CC_BIG_ENDIAN 1
#else
# define CC_LITTLE_ENDIAN 1
#endif

#ifdef HAVE_STATS
# define CC_STATS 1
#endif
Expand Down
33 changes: 0 additions & 33 deletions include/hash/cc_lookup3.h

This file was deleted.

2 changes: 1 addition & 1 deletion rust/cc_binding/build.rs
Expand Up @@ -48,7 +48,7 @@ fn get_cmake_cache_value(binary_dir: &Path, key: &str) -> Result<Option<String>>
}

fn main() {
println!("cargo:rustc-link-lib=static=ccommon-1.2.0");
println!("cargo:rustc-link-lib=static=ccommon-2.0.0");
if cfg!(target_os = "macos") {
println!("cargo:rustc-link-lib=framework=Security");
}
Expand Down
1 change: 0 additions & 1 deletion src/hash/CMakeLists.txt
@@ -1,5 +1,4 @@
set(SOURCE
${SOURCE}
hash/cc_lookup3.c
hash/cc_murmur3.c
PARENT_SCOPE)

0 comments on commit a6a54d9

Please sign in to comment.