diff --git a/cpp/src/parquet/encodings/encodings.h b/cpp/src/parquet/encodings/encodings.h index 50b6291adfbfb..b30146a2c5919 100644 --- a/cpp/src/parquet/encodings/encodings.h +++ b/cpp/src/parquet/encodings/encodings.h @@ -72,8 +72,9 @@ class Decoder { #include "parquet/encodings/plain-encoding.h" #include "parquet/encodings/dictionary-encoding.h" -#include "parquet/encodings/delta-bit-pack-encoding.h" -#include "parquet/encodings/delta-length-byte-array-encoding.h" -#include "parquet/encodings/delta-byte-array-encoding.h" +// The encoding tools changed and these are missing the ZigZag functions +// #include "parquet/encodings/delta-bit-pack-encoding.h" +// #include "parquet/encodings/delta-length-byte-array-encoding.h" +// #include "parquet/encodings/delta-byte-array-encoding.h" #endif // PARQUET_ENCODINGS_ENCODINGS_H diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 8c15de1d1afde..1c86112ddec19 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -20,6 +20,8 @@ install(FILES bit-stream-utils.h bit-stream-utils.inline.h bit-util.h + cpu-info.h + sse-util.h compiler-util.h logging.h rle-encoding.h @@ -29,6 +31,7 @@ install(FILES add_library(parquet_util STATIC input_stream.cc + cpu-info.cc ) add_library(parquet_test_main @@ -47,3 +50,6 @@ else() pthread ) endif() + +ADD_PARQUET_TEST(bit-util-test) +ADD_PARQUET_TEST(rle-test) diff --git a/cpp/src/parquet/util/bit-stream-utils.h b/cpp/src/parquet/util/bit-stream-utils.h index 97ba71b833eb1..a02839dc3b438 100644 --- a/cpp/src/parquet/util/bit-stream-utils.h +++ b/cpp/src/parquet/util/bit-stream-utils.h @@ -15,26 +15,28 @@ // specific language governing permissions and limitations // under the License. 
+// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H #define PARQUET_UTIL_BIT_STREAM_UTILS_H -#include #include #include +#include #include "parquet/util/compiler-util.h" -#include "parquet/util/bit-util.h" #include "parquet/util/logging.h" +#include "parquet/util/bit-util.h" namespace parquet_cpp { -// Utility class to write bit/byte streams. This class can write data to either be -// bit packed or byte aligned (and a single stream that has a mix of both). -// This class does not allocate memory. +/// Utility class to write bit/byte streams. This class can write data to either be +/// bit packed or byte aligned (and a single stream that has a mix of both). +/// This class does not allocate memory. class BitWriter { public: - // buffer: buffer to write bits to. Buffer should be preallocated with - // 'buffer_len' bytes. + /// buffer: buffer to write bits to. Buffer should be preallocated with + /// 'buffer_len' bytes. BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) { @@ -47,56 +49,56 @@ class BitWriter { bit_offset_ = 0; } - // The number of current bytes written, including the current byte (i.e. may include a - // fraction of a byte). Includes buffered values. + /// The number of current bytes written, including the current byte (i.e. may include a + /// fraction of a byte). Includes buffered values. int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } uint8_t* buffer() const { return buffer_; } int buffer_len() const { return max_bytes_; } - // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - // packed. Returns false if there was not enough space. num_bits must be <= 32. + /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit + /// packed. Returns false if there was not enough space. num_bits must be <= 32. bool PutValue(uint64_t v, int num_bits); - // Writes v to the next aligned byte using num_bytes. 
If T is larger than num_bytes, the - // extra high-order bytes will be ignored. Returns false if there was not enough space. + /// Writes v to the next aligned byte using num_bytes. If T is larger than + /// num_bytes, the extra high-order bytes will be ignored. Returns false if + /// there was not enough space. template bool PutAligned(T v, int num_bytes); - // Write a Vlq encoded int to the buffer. Returns false if there was not enough - // room. The value is written byte aligned. - // For more details on vlq: - // en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint32_t v); - bool PutZigZagVlqInt(int32_t v); + /// Write a Vlq encoded int to the buffer. Returns false if there was not enough + /// room. The value is written byte aligned. + /// For more details on vlq: + /// en.wikipedia.org/wiki/Variable-length_quantity + bool PutVlqInt(int32_t v); - // Get a pointer to the next aligned byte and advance the underlying buffer - // by num_bytes. - // Returns NULL if there was not enough space. + /// Get a pointer to the next aligned byte and advance the underlying buffer + /// by num_bytes. + /// Returns NULL if there was not enough space. uint8_t* GetNextBytePtr(int num_bytes = 1); - // Flushes all buffered values to the buffer. Call this when done writing to the buffer. - // If 'align' is true, buffered_values_ is reset and any future writes will be written - // to the next byte boundary. + /// Flushes all buffered values to the buffer. Call this when done writing to + /// the buffer. If 'align' is true, buffered_values_ is reset and any future + /// writes will be written to the next byte boundary. void Flush(bool align = false); private: uint8_t* buffer_; int max_bytes_; - // Bit-packed values are initially written to this variable before being memcpy'd to - // buffer_. This is faster than writing values byte by byte directly to buffer_. + /// Bit-packed values are initially written to this variable before being memcpy'd to + /// buffer_. 
This is faster than writing values byte by byte directly to buffer_. uint64_t buffered_values_; int byte_offset_; // Offset in buffer_ int bit_offset_; // Offset in buffered_values_ }; -// Utility class to read bit/byte stream. This class can read bits or bytes -// that are either byte aligned or not. It also has utilities to read multiple -// bytes in one read (e.g. encoded int). +/// Utility class to read bit/byte stream. This class can read bits or bytes +/// that are either byte aligned or not. It also has utilities to read multiple +/// bytes in one read (e.g. encoded int). class BitReader { public: - // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. + /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. BitReader(const uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len), @@ -108,36 +110,48 @@ class BitReader { BitReader() : buffer_(NULL), max_bytes_(0) {} - // Gets the next value from the buffer. Returns true if 'v' could be read or false if - // there are not enough bytes left. num_bits must be <= 32. + void Reset(const uint8_t* buffer, int buffer_len) { + buffer_ = buffer; + max_bytes_ = buffer_len; + byte_offset_ = 0; + bit_offset_ = 0; + } + + /// Gets the next value from the buffer. Returns true if 'v' could be read or false if + /// there are not enough bytes left. num_bits must be <= 32. template bool GetValue(int num_bits, T* v); - // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a - // little-endian native type and big enough to store 'num_bytes'. The value is assumed - // to be byte-aligned so the stream will be advanced to the start of the next byte - // before 'v' is read. Returns false if there are not enough bytes left. + /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T + /// needs to be a little-endian native type and big enough to store + /// 'num_bytes'. 
The value is assumed to be byte-aligned so the stream will + /// be advanced to the start of the next byte before 'v' is read. Returns + /// false if there are not enough bytes left. template bool GetAligned(int num_bytes, T* v); - // Reads a vlq encoded int from the stream. The encoded int must start at the - // beginning of a byte. Return false if there were not enough bytes in the buffer. - bool GetVlqInt(uint64_t* v); - bool GetZigZagVlqInt(int64_t* v); + /// Reads a vlq encoded int from the stream. The encoded int must start at + /// the beginning of a byte. Return false if there were not enough bytes in + /// the buffer. + bool GetVlqInt(int32_t* v); - // Returns the number of bytes left in the stream, not including the current byte (i.e., - // there may be an additional fraction of a byte). + /// Returns the number of bytes left in the stream, not including the current + /// byte (i.e., there may be an additional fraction of a byte). int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } - // Maximum byte length of a vlq encoded int + /// Maximum byte length of a vlq encoded int static const int MAX_VLQ_BYTE_LEN = 5; + // TODO(nongli): implementations to be fixed given changes in Impala + // bool GetZigZagVlqInt(int64_t* v); + // bool PutZigZagVlqInt(int32_t v); + private: const uint8_t* buffer_; int max_bytes_; - // Bytes are memcpy'd from buffer_ and values are read from this variable. This is - // faster than reading values byte by byte directly from buffer_. + /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is + /// faster than reading values byte by byte directly from buffer_. 
uint64_t buffered_values_; int byte_offset_; // Offset in buffer_ @@ -146,4 +160,4 @@ class BitReader { } // namespace parquet_cpp -#endif +#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H diff --git a/cpp/src/parquet/util/bit-stream-utils.inline.h b/cpp/src/parquet/util/bit-stream-utils.inline.h index 6171b1fb04f6a..77e2d48817110 100644 --- a/cpp/src/parquet/util/bit-stream-utils.inline.h +++ b/cpp/src/parquet/util/bit-stream-utils.inline.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H #define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H @@ -73,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) { return true; } -inline bool BitWriter::PutVlqInt(uint32_t v) { +inline bool BitWriter::PutVlqInt(int32_t v) { bool result = true; while ((v & 0xFFFFFF80) != 0L) { result &= PutAligned((v & 0x7F) | 0x80, 1); @@ -83,13 +85,9 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { return result; } -inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u = (v << 1) ^ (v >> 31); - return PutVlqInt(u); -} - template inline bool BitReader::GetValue(int num_bits, T* v) { + DCHECK(buffer_ != NULL); // TODO: revisit this limit if necessary DCHECK_LE(num_bits, 32); DCHECK_LE(num_bits, sizeof(T) * 8); @@ -140,7 +138,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) { return true; } -inline bool BitReader::GetVlqInt(uint64_t* v) { +inline bool BitReader::GetVlqInt(int32_t* v) { *v = 0; int shift = 0; int num_bytes = 0; @@ -154,12 +152,20 @@ inline bool BitReader::GetVlqInt(uint64_t* v) { return true; } -inline bool BitReader::GetZigZagVlqInt(int64_t* v) { - uint64_t u; - if (!GetVlqInt(&u)) return false; - *reinterpret_cast(v) = (u >> 1) ^ -(u & 1); - return true; -} +// TODO(nongli): review/test these implementations given divergence in Impala +// functions + +// inline bool BitWriter::PutZigZagVlqInt(int32_t v) { +// uint32_t u 
= (v << 1) ^ (v >> 31); +// return PutVlqInt(u); +// } + +// inline bool BitReader::GetZigZagVlqInt(int64_t* v) { +// uint64_t u; +// if (!GetVlqInt(&u)) return false; +// *reinterpret_cast(v) = (u >> 1) ^ -(u & 1); +// return true; +// } } // namespace parquet_cpp diff --git a/cpp/src/parquet/util/bit-util-test.cc b/cpp/src/parquet/util/bit-util-test.cc new file mode 100644 index 0000000000000..78efe1a85536e --- /dev/null +++ b/cpp/src/parquet/util/bit-util-test.cc @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// From Apache Impala as of 2016-01-29 + +#include +#include +#include +#include + +#include +#include + +#include "parquet/util/bit-util.h" +#include "parquet/util/cpu-info.h" + +namespace parquet_cpp { + +TEST(BitUtil, Ceil) { + EXPECT_EQ(BitUtil::Ceil(0, 1), 0); + EXPECT_EQ(BitUtil::Ceil(1, 1), 1); + EXPECT_EQ(BitUtil::Ceil(1, 2), 1); + EXPECT_EQ(BitUtil::Ceil(1, 8), 1); + EXPECT_EQ(BitUtil::Ceil(7, 8), 1); + EXPECT_EQ(BitUtil::Ceil(8, 8), 1); + EXPECT_EQ(BitUtil::Ceil(9, 8), 2); + EXPECT_EQ(BitUtil::Ceil(9, 9), 1); + EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000); + EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1); + EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10); +} + +TEST(BitUtil, RoundUp) { + EXPECT_EQ(BitUtil::RoundUp(0, 1), 0); + EXPECT_EQ(BitUtil::RoundUp(1, 1), 1); + EXPECT_EQ(BitUtil::RoundUp(1, 2), 2); + EXPECT_EQ(BitUtil::RoundUp(6, 2), 6); + EXPECT_EQ(BitUtil::RoundUp(7, 3), 9); + EXPECT_EQ(BitUtil::RoundUp(9, 9), 9); + EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010); + EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000); + EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000); +} + +TEST(BitUtil, RoundDown) { + EXPECT_EQ(BitUtil::RoundDown(0, 1), 0); + EXPECT_EQ(BitUtil::RoundDown(1, 1), 1); + EXPECT_EQ(BitUtil::RoundDown(1, 2), 0); + EXPECT_EQ(BitUtil::RoundDown(6, 2), 6); + EXPECT_EQ(BitUtil::RoundDown(7, 3), 6); + EXPECT_EQ(BitUtil::RoundDown(9, 9), 9); + EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000); + EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0); + EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000); +} + +TEST(BitUtil, Popcount) { + EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4); + EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4); + EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6); + EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6); + EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 
8); + EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8); + EXPECT_EQ(BitUtil::Popcount(0), 0); + EXPECT_EQ(BitUtil::PopcountNoHw(0), 0); +} + +TEST(BitUtil, TrailingBits) { + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64), + BOOST_BINARY(1 1 1 1 1 1 1 1)); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100), + BOOST_BINARY(1 1 1 1 1 1 1 1)); + EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0); + EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63); +} + +TEST(BitUtil, ByteSwap) { + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap( + static_cast(0x1122334455667788)), 0x8877665544332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap( + static_cast(0x1122334455667788)), 0x8877665544332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); +} + +TEST(BitUtil, Log2) { + EXPECT_EQ(BitUtil::Log2(1), 0); + EXPECT_EQ(BitUtil::Log2(2), 1); + EXPECT_EQ(BitUtil::Log2(3), 2); + EXPECT_EQ(BitUtil::Log2(4), 2); + EXPECT_EQ(BitUtil::Log2(5), 3); + EXPECT_EQ(BitUtil::Log2(INT_MAX), 31); + EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32); + EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64); +} + +TEST(BitUtil, RoundUpToPowerOf2) { + EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 
8), 8); + EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8); + EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16); +} + +TEST(BitUtil, RoundDownToPowerOf2) { + EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0); + EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8); + EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8); +} + +TEST(BitUtil, RoundUpDown) { + EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1); + EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1); + EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2); + EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0); + EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1); + EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1); + + EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1); + EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1); + EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2); + EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0); + EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1); + EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1); + + EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1); + EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1); + EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2); + EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0); + EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1); + EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1); +} + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/util/bit-util.h b/cpp/src/parquet/util/bit-util.h index 593d1c2cfe447..4db585a0ccc67 100644 --- a/cpp/src/parquet/util/bit-util.h +++ b/cpp/src/parquet/util/bit-util.h @@ -15,47 +15,132 @@ // specific language governing permissions and limitations // under the License. +// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_BIT_UTIL_H #define PARQUET_UTIL_BIT_UTIL_H #if defined(__APPLE__) - #include +#include #else - #include +#include #endif +#include + #include "parquet/util/compiler-util.h" -#include "parquet/util/logging.h" +#include "parquet/util/cpu-info.h" +#include "parquet/util/sse-util.h" namespace parquet_cpp { -// Utility class to do standard bit tricks -// TODO: is this in boost or something else like that? 
+using boost::make_unsigned; + +/// Utility class to do standard bit tricks +/// TODO: is this in boost or something else like that? class BitUtil { public: - // Returns the ceil of value/divisor - static inline int Ceil(int value, int divisor) { + /// Returns the ceil of value/divisor + static inline int64_t Ceil(int64_t value, int64_t divisor) { return value / divisor + (value % divisor != 0); } - // Returns 'value' rounded up to the nearest multiple of 'factor' - static inline int RoundUp(int value, int factor) { + /// Returns 'value' rounded up to the nearest multiple of 'factor' + static inline int64_t RoundUp(int64_t value, int64_t factor) { return (value + (factor - 1)) / factor * factor; } - // Returns 'value' rounded down to the nearest multiple of 'factor' - static inline int RoundDown(int value, int factor) { + /// Returns 'value' rounded down to the nearest multiple of 'factor' + static inline int64_t RoundDown(int64_t value, int64_t factor) { return (value / factor) * factor; } - // Returns the number of set bits in x - static inline int Popcount(uint64_t x) { + /// Returns the smallest power of two that contains v. Taken from + /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + /// TODO: Pick a better name, as it is not clear what happens when the input is + /// already a power of two. 
+ static inline int64_t NextPowerOfTwo(int64_t v) { + --v; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + ++v; + return v; + } + + /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is + /// a power of two + static inline int RoundUpToPowerOf2(int value, int factor) { + DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); + return (value + (factor - 1)) & ~(factor - 1); + } + + static inline int RoundDownToPowerOf2(int value, int factor) { + DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); + return value & ~(factor - 1); + } + + /// Specialized round up and down functions for frequently used factors, + /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64). + /// Returns the rounded up number of bytes that fit the number of bits. + static inline uint32_t RoundUpNumBytes(uint32_t bits) { + return (bits + 7) >> 3; + } + + /// Returns the rounded down number of bytes that fit the number of bits. + static inline uint32_t RoundDownNumBytes(uint32_t bits) { + return bits >> 3; + } + + /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32. + static inline uint32_t RoundUpNumi32(uint32_t bits) { + return (bits + 31) >> 5; + } + + /// Returns the rounded down to 32 multiple. + static inline uint32_t RoundDownNumi32(uint32_t bits) { + return bits >> 5; + } + + /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64. + static inline uint32_t RoundUpNumi64(uint32_t bits) { + return (bits + 63) >> 6; + } + + /// Returns the rounded down to 64 multiple. + static inline uint32_t RoundDownNumi64(uint32_t bits) { + return bits >> 6; + } + + /// Non hw accelerated pop count. + /// TODO: we don't use this in any perf sensitive code paths currently. There + /// might be a much faster way to implement this. 
+ static inline int PopcountNoHw(uint64_t x) { int count = 0; for (; x != 0; ++count) x &= x-1; return count; } - // Returns the 'num_bits' least-significant bits of 'v'. + /// Returns the number of set bits in x + static inline int Popcount(uint64_t x) { + if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) { + return POPCNT_popcnt_u64(x); + } else { + return PopcountNoHw(x); + } + } + + // Compute correct population count for various-width signed integers + template + static inline int PopcountSigned(T v) { + // Converting to same-width unsigned then extending preserves the bit pattern. + return BitUtil::Popcount(static_cast::type>(v)); + } + + /// Returns the 'num_bits' least-significant bits of 'v'. static inline uint64_t TrailingBits(uint64_t v, int num_bits) { if (UNLIKELY(num_bits == 0)) return 0; if (UNLIKELY(num_bits >= 64)) return v; @@ -63,11 +148,12 @@ class BitUtil { return (v << n) >> n; } - // Returns ceil(log2(x)). - // TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up - // in a hot path. + /// Returns ceil(log2(x)). + /// TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up + /// in a hot path. static inline int Log2(uint64_t x) { - if (x == 0) return 0; + DCHECK_GT(x, 0); + if (x == 1) return 0; // Compute result = ceil(log2(x)) // = floor(log2(x - 1)) + 1, for x > 1 // by finding the position of the most significant bit (1-indexed) of x - 1 @@ -78,15 +164,7 @@ class BitUtil { return result; } - // Returns the minimum number of bits needed to represent the value of 'x' - static inline int NumRequiredBits(uint64_t x) { - for (int i = 63; i >= 0; --i) { - if (x & 1L << i) return i + 1; - } - return 0; - } - - // Swaps the byte order (i.e. endianess) + /// Swaps the byte order (i.e. 
endianness) static inline int64_t ByteSwap(int64_t value) { return __builtin_bswap64(value); } @@ -106,7 +184,7 @@ class BitUtil { return static_cast(ByteSwap(static_cast(value))); } - // Write the swapped bytes into dst. Src and st cannot overlap. + /// Write the swapped bytes into dst. Src and dst cannot overlap. static inline void ByteSwap(void* dst, const void* src, int len) { switch (len) { case 1: @@ -134,8 +212,8 @@ class BitUtil { } } - // Converts to big endian format (if not already in big endian) from the - // machine's native endian format. + /// Converts to big endian format (if not already in big endian) from the + /// machine's native endian format. #if __BYTE_ORDER == __LITTLE_ENDIAN static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); } @@ -152,7 +230,7 @@ class BitUtil { static inline uint16_t ToBigEndian(uint16_t val) { return val; } #endif - // Converts from big endian format to the machine's native endian format. + /// Converts from big endian format to the machine's native endian format. 
#if __BYTE_ORDER == __LITTLE_ENDIAN static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); } @@ -168,6 +246,36 @@ class BitUtil { static inline int16_t FromBigEndian(int16_t val) { return val; } static inline uint16_t FromBigEndian(uint16_t val) { return val; } #endif + + // Logical right shift for signed integer types + // This is needed because the C >> operator does arithmetic right shift + // Negative shift amounts lead to undefined behavior + template + static T ShiftRightLogical(T v, int shift) { + // Conversion to unsigned ensures most significant bits always filled with 0's + return static_cast::type>(v) >> shift; + } + + // Get a specific bit of a numeric type + template + static inline int8_t GetBit(T v, int bitpos) { + T masked = v & (static_cast(0x1) << bitpos); + return static_cast(ShiftRightLogical(masked, bitpos)); + } + + // Set a specific bit to 1 + // Behavior when bitpos is negative is undefined + template + static T SetBit(T v, int bitpos) { + return v | (static_cast(0x1) << bitpos); + } + + // Set a specific bit to 0 + // Behavior when bitpos is negative is undefined + template + static T UnsetBit(T v, int bitpos) { + return v & ~(static_cast(0x1) << bitpos); + } }; } // namespace parquet_cpp diff --git a/cpp/src/parquet/util/cpu-info.cc b/cpp/src/parquet/util/cpu-info.cc new file mode 100644 index 0000000000000..610fb623ed042 --- /dev/null +++ b/cpp/src/parquet/util/cpu-info.cc @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. Pared down to a minimal set of +// functions needed for parquet-cpp + +#include "parquet/util/cpu-info.h" + +#ifdef __APPLE__ +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "parquet/exception.h" + +using boost::algorithm::contains; +using boost::algorithm::trim; +using std::max; +using std::string; + +namespace parquet_cpp { + +bool CpuInfo::initialized_ = false; +int64_t CpuInfo::hardware_flags_ = 0; +int64_t CpuInfo::original_hardware_flags_; +int64_t CpuInfo::cache_sizes_[L3_CACHE + 1]; +int64_t CpuInfo::cycles_per_ms_; +int CpuInfo::num_cores_ = 1; +string CpuInfo::model_name_ = "unknown"; // NOLINT + +static struct { + string name; + int64_t flag; +} flag_mappings[] = { + { "ssse3", CpuInfo::SSSE3 }, + { "sse4_1", CpuInfo::SSE4_1 }, + { "sse4_2", CpuInfo::SSE4_2 }, + { "popcnt", CpuInfo::POPCNT }, +}; +static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + +// Helper function to parse for hardware flags. +// values contains a list of space-separated flags. Check to see if the flags we +// care about are present. +// Returns a bitmap of flags. 
+int64_t ParseCPUFlags(const string& values) { + int64_t flags = 0; + for (int i = 0; i < num_flags; ++i) { + if (contains(values, flag_mappings[i].name)) { + flags |= flag_mappings[i].flag; + } + } + return flags; +} + +void CpuInfo::Init() { + string line; + string name; + string value; + + float max_mhz = 0; + int num_cores = 0; + + memset(&cache_sizes_, 0, sizeof(cache_sizes_)); + + // Read from /proc/cpuinfo + std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); + while (cpuinfo) { + getline(cpuinfo, line); + size_t colon = line.find(':'); + if (colon != string::npos) { + name = line.substr(0, colon - 1); + value = line.substr(colon + 1, string::npos); + trim(name); + trim(value); + if (name.compare("flags") == 0) { + hardware_flags_ |= ParseCPUFlags(value); + } else if (name.compare("cpu MHz") == 0) { + // Every core will report a different speed. We'll take the max, assuming + // that when impala is running, the core will not be in a lower power state. + // TODO: is there a more robust way to do this, such as + // Window's QueryPerformanceFrequency() + float mhz = atof(value.c_str()); + max_mhz = max(mhz, max_mhz); + } else if (name.compare("processor") == 0) { + ++num_cores; + } else if (name.compare("model name") == 0) { + model_name_ = value; + } + } + } + if (cpuinfo.is_open()) cpuinfo.close(); + +#ifdef __APPLE__ + // On Mac OS X use sysctl() to get the cache sizes + size_t len = 0; + sysctlbyname("hw.cachesize", NULL, &len, NULL, 0); + uint64_t* data = static_cast(malloc(len)); + sysctlbyname("hw.cachesize", data, &len, NULL, 0); + DCHECK(len / sizeof(uint64_t) >= 3); + for (size_t i = 0; i < 3; ++i) { + cache_sizes_[i] = data[i]; + } +#else + // Call sysconf to query for the cache sizes + cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); + cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); + cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); +#endif + + if (max_mhz != 0) { + cycles_per_ms_ = max_mhz * 1000; + } else { + cycles_per_ms_ = 1000000; + } + 
original_hardware_flags_ = hardware_flags_; + + if (num_cores > 0) { + num_cores_ = num_cores; + } else { + num_cores_ = 1; + } + + initialized_ = true; +} + +void CpuInfo::VerifyCpuRequirements() { + if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) { + throw ParquetException("CPU does not support the Supplemental SSE3 instruction set"); + } +} + +void CpuInfo::EnableFeature(int64_t flag, bool enable) { + DCHECK(initialized_); + if (!enable) { + hardware_flags_ &= ~flag; + } else { + // Can't turn something on that can't be supported + DCHECK((original_hardware_flags_ & flag) != 0); + hardware_flags_ |= flag; + } +} + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/util/cpu-info.h b/cpp/src/parquet/util/cpu-info.h new file mode 100644 index 0000000000000..9026cde6630ec --- /dev/null +++ b/cpp/src/parquet/util/cpu-info.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. 
Pared down to a minimal set of +// functions needed for parquet-cpp + +#ifndef PARQUET_UTIL_CPU_INFO_H +#define PARQUET_UTIL_CPU_INFO_H + +#include +#include + +#include "parquet/util/logging.h" + +namespace parquet_cpp { + +/// CpuInfo is an interface to query for cpu information at runtime. The caller can +/// ask for the sizes of the caches and what hardware features are supported. +/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and +/// /sys/devices) +class CpuInfo { + public: + static const int64_t SSSE3 = (1 << 1); + static const int64_t SSE4_1 = (1 << 2); + static const int64_t SSE4_2 = (1 << 3); + static const int64_t POPCNT = (1 << 4); + + /// Cache enums for L1 (data), L2 and L3 + enum CacheLevel { + L1_CACHE = 0, + L2_CACHE = 1, + L3_CACHE = 2, + }; + + /// Initialize CpuInfo. + static void Init(); + + /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error + /// and terminate. + static void VerifyCpuRequirements(); + + /// Returns all the flags for this cpu + static int64_t hardware_flags() { + DCHECK(initialized_); + return hardware_flags_; + } + + /// Returns whether of not the cpu supports this flag + inline static bool IsSupported(int64_t flag) { + DCHECK(initialized_); + return (hardware_flags_ & flag) != 0; + } + + /// Toggle a hardware feature on and off. It is not valid to turn on a feature + /// that the underlying hardware cannot support. This is useful for testing. + static void EnableFeature(int64_t flag, bool enable); + + /// Returns the size of the cache in KB at this cache level + static int64_t CacheSize(CacheLevel level) { + DCHECK(initialized_); + return cache_sizes_[level]; + } + + /// Returns the number of cpu cycles per millisecond + static int64_t cycles_per_ms() { + DCHECK(initialized_); + return cycles_per_ms_; + } + + /// Returns the number of cores (including hyper-threaded) on this machine. 
+ static int num_cores() { + DCHECK(initialized_); + return num_cores_; + } + + /// Returns the model name of the cpu (e.g. Intel i7-2600) + static std::string model_name() { + DCHECK(initialized_); + return model_name_; + } + + private: + static bool initialized_; + static int64_t hardware_flags_; + static int64_t original_hardware_flags_; + static int64_t cache_sizes_[L3_CACHE + 1]; + static int64_t cycles_per_ms_; + static int num_cores_; + static std::string model_name_; // NOLINT +}; + +} // namespace parquet_cpp + +#endif // PARQUET_UTIL_CPU_INFO_H diff --git a/cpp/src/parquet/util/rle-encoding.h b/cpp/src/parquet/util/rle-encoding.h index e65368e7fc7a2..22b2c2fcaf0c6 100644 --- a/cpp/src/parquet/util/rle-encoding.h +++ b/cpp/src/parquet/util/rle-encoding.h @@ -15,74 +15,75 @@ // specific language governing permissions and limitations // under the License. +// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_RLE_ENCODING_H #define PARQUET_UTIL_RLE_ENCODING_H -#include #include +#include #include "parquet/util/compiler-util.h" #include "parquet/util/bit-stream-utils.inline.h" #include "parquet/util/bit-util.h" -#include "parquet/util/logging.h" namespace parquet_cpp { -// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs -// are sufficiently long, RLE is used, otherwise, the values are just bit-packed -// (literal encoding). -// For both types of runs, there is a byte-aligned indicator which encodes the length -// of the run and the type of the run. -// This encoding has the benefit that when there aren't any long enough runs, values -// are always decoded at fixed (can be precomputed) bit offsets OR both the value and -// the run length are byte aligned. This allows for very efficient decoding -// implementations. -// The encoding is: -// encoded-block := run* -// run := literal-run | repeated-run -// literal-run := literal-indicator < literal bytes > -// repeated-run := repeated-indicator < repeated value. 
padded to byte boundary > -// literal-indicator := varint_encode( number_of_groups << 1 | 1) -// repeated-indicator := varint_encode( number_of_repetitions << 1 ) +/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs +/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed +/// (literal encoding). +/// For both types of runs, there is a byte-aligned indicator which encodes the length +/// of the run and the type of the run. +/// This encoding has the benefit that when there aren't any long enough runs, values +/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and +/// the run length are byte aligned. This allows for very efficient decoding +/// implementations. +/// The encoding is: +/// encoded-block := run* +/// run := literal-run | repeated-run +/// literal-run := literal-indicator < literal bytes > +/// repeated-run := repeated-indicator < repeated value. padded to byte boundary > +/// literal-indicator := varint_encode( number_of_groups << 1 | 1) +/// repeated-indicator := varint_encode( number_of_repetitions << 1 ) // -// Each run is preceded by a varint. The varint's least significant bit is -// used to indicate whether the run is a literal run or a repeated run. The rest -// of the varint is used to determine the length of the run (eg how many times the -// value repeats). +/// Each run is preceded by a varint. The varint's least significant bit is +/// used to indicate whether the run is a literal run or a repeated run. The rest +/// of the varint is used to determine the length of the run (eg how many times the +/// value repeats). // -// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode -// in groups of 8), so that no matter the bit-width of the value, the sequence will end -// on a byte boundary without padding. 
-// Given that we know it is a multiple of 8, we store the number of 8-groups rather than -// the actual number of encoded ints. (This means that the total number of encoded values -// can not be determined from the encoded data, since the number of values in the last -// group may not be a multiple of 8). For the last group of literal runs, we pad -// the group to 8 with zeros. This allows for 8 at a time decoding on the read side -// without the need for additional checks. +/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode +/// in groups of 8), so that no matter the bit-width of the value, the sequence will end +/// on a byte boundary without padding. +/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than +/// the actual number of encoded ints. (This means that the total number of encoded values +/// can not be determined from the encoded data, since the number of values in the last +/// group may not be a multiple of 8). For the last group of literal runs, we pad +/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side +/// without the need for additional checks. // -// There is a break-even point when it is more storage efficient to do run length -// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes -// for both the repeated encoding or the literal encoding. This value can always -// be computed based on the bit-width. -// TODO: think about how to use this for strings. The bit packing isn't quite the same. +/// There is a break-even point when it is more storage efficient to do run length +/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes +/// for both the repeated encoding or the literal encoding. This value can always +/// be computed based on the bit-width. +/// TODO: think about how to use this for strings. The bit packing isn't quite the same. 
// -// Examples with bit-width 1 (eg encoding booleans): -// ---------------------------------------- -// 100 1s followed by 100 0s: -// <1, padded to 1 byte>   <0, padded to 1 byte> -// - (total 4 bytes) +/// Examples with bit-width 1 (eg encoding booleans): +/// ---------------------------------------- +/// 100 1s followed by 100 0s: +/// <1, padded to 1 byte>   <0, padded to 1 byte> +/// - (total 4 bytes) // -// alternating 1s and 0s (200 total): -// 200 ints = 25 groups of 8 -// <25 bytes of values, bitpacked> -// (total 26 bytes, 1 byte overhead) +/// alternating 1s and 0s (200 total): +/// 200 ints = 25 groups of 8 +/// <25 bytes of values, bitpacked> +/// (total 26 bytes, 1 byte overhead) // -// Decoder class for RLE encoded data. +/// Decoder class for RLE encoded data. class RleDecoder { public: - // Create a decoder object. buffer/buffer_len is the decoded data. - // bit_width is the width of each value (before encoding). + /// Create a decoder object. buffer/buffer_len is the decoded data. + /// bit_width is the width of each value (before encoding). RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) : bit_reader_(buffer, buffer_len), bit_width_(bit_width), @@ -93,170 +94,172 @@ class RleDecoder { DCHECK_LE(bit_width_, 64); } - RleDecoder() {} + RleDecoder() : bit_width_(-1) {} + + void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { + DCHECK_GE(bit_width, 0); + DCHECK_LE(bit_width, 64); + bit_reader_.Reset(buffer, buffer_len); + bit_width_ = bit_width; + current_value_ = 0; + repeat_count_ = 0; + literal_count_ = 0; + } - // Gets the next value. Returns false if there are no more. + /// Gets the next value. Returns false if there are no more. template bool Get(T* val); - private: + protected: BitReader bit_reader_; + /// Number of bits needed to encode the value. Must be between 0 and 64. 
int bit_width_; uint64_t current_value_; uint32_t repeat_count_; uint32_t literal_count_; + + private: + /// Fills literal_count_ and repeat_count_ with next values. Returns false if there + /// are no more. + template + bool NextCounts(); }; -// Class to incrementally build the rle data. This class does not allocate any memory. -// The encoding has two modes: encoding repeated runs and literal runs. -// If the run is sufficiently short, it is more efficient to encode as a literal run. -// This class does so by buffering 8 values at a time. If they are not all the same -// they are added to the literal run. If they are the same, they are added to the -// repeated run. When we switch modes, the previous run is flushed out. +/// Class to incrementally build the rle data. This class does not allocate any memory. +/// The encoding has two modes: encoding repeated runs and literal runs. +/// If the run is sufficiently short, it is more efficient to encode as a literal run. +/// This class does so by buffering 8 values at a time. If they are not all the same +/// they are added to the literal run. If they are the same, they are added to the +/// repeated run. When we switch modes, the previous run is flushed out. class RleEncoder { public: - // buffer/buffer_len: preallocated output buffer. - // bit_width: max number of bits for value. - // TODO: consider adding a min_repeated_run_length so the caller can control - // when values should be encoded as repeated runs. Currently this is derived - // based on the bit_width, which can determine a storage optimal choice. - // TODO: allow 0 bit_width (and have dict encoder use it) + /// buffer/buffer_len: preallocated output buffer. + /// bit_width: max number of bits for value. + /// TODO: consider adding a min_repeated_run_length so the caller can control + /// when values should be encoded as repeated runs. Currently this is derived + /// based on the bit_width, which can determine a storage optimal choice. 
+ /// TODO: allow 0 bit_width (and have dict encoder use it) RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { - DCHECK_GE(bit_width_, 1); + DCHECK_GE(bit_width_, 0); DCHECK_LE(bit_width_, 64); max_run_byte_size_ = MinBufferSize(bit_width); DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough."; Clear(); } - // Returns the minimum buffer size needed to use the encoder for 'bit_width' - // This is the maximum length of a single run for 'bit_width'. - // It is not valid to pass a buffer less than this length. + /// Returns the minimum buffer size needed to use the encoder for 'bit_width' + /// This is the maximum length of a single run for 'bit_width'. + /// It is not valid to pass a buffer less than this length. static int MinBufferSize(int bit_width) { - // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. + /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. int max_literal_run_size = 1 + BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8); - // Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. + /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. int max_repeated_run_size = BitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8); return std::max(max_literal_run_size, max_repeated_run_size); } - // Returns the maximum byte size it could take to encode 'num_values'. + /// Returns the maximum byte size it could take to encode 'num_values'. static int MaxBufferSize(int bit_width, int num_values) { int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0); int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN); int literal_max_size = num_runs + num_runs * bytes_per_run; - int min_run_size = MinBufferSize(bit_width); - return std::max(min_run_size, literal_max_size) + min_run_size; + return std::max(MinBufferSize(bit_width), literal_max_size); } - // Encode value. 
Returns true if the value fits in buffer, false otherwise. - // This value must be representable with bit_width_ bits. + /// Encode value. Returns true if the value fits in buffer, false otherwise. + /// This value must be representable with bit_width_ bits. bool Put(uint64_t value); - // Flushes any pending values to the underlying buffer. - // Returns the total number of bytes written + /// Flushes any pending values to the underlying buffer. + /// Returns the total number of bytes written int Flush(); - // Resets all the state in the encoder. + /// Resets all the state in the encoder. void Clear(); - // Returns pointer to underlying buffer + /// Returns pointer to underlying buffer uint8_t* buffer() { return bit_writer_.buffer(); } int32_t len() { return bit_writer_.bytes_written(); } private: - // Flushes any buffered values. If this is part of a repeated run, this is largely - // a no-op. - // If it is part of a literal run, this will call FlushLiteralRun, which writes - // out the buffered literal values. - // If 'done' is true, the current run would be written even if it would normally - // have been buffered more. This should only be called at the end, when the - // encoder has received all values even if it would normally continue to be - // buffered. + /// Flushes any buffered values. If this is part of a repeated run, this is largely + /// a no-op. + /// If it is part of a literal run, this will call FlushLiteralRun, which writes + /// out the buffered literal values. + /// If 'done' is true, the current run would be written even if it would normally + /// have been buffered more. This should only be called at the end, when the + /// encoder has received all values even if it would normally continue to be + /// buffered. void FlushBufferedValues(bool done); - // Flushes literal values to the underlying buffer. If update_indicator_byte, - // then the current literal run is complete and the indicator byte is updated. 
+ /// Flushes literal values to the underlying buffer. If update_indicator_byte, + /// then the current literal run is complete and the indicator byte is updated. void FlushLiteralRun(bool update_indicator_byte); - // Flushes a repeated run to the underlying buffer. + /// Flushes a repeated run to the underlying buffer. void FlushRepeatedRun(); - // Checks and sets buffer_full_. This must be called after flushing a run to - // make sure there are enough bytes remaining to encode the next run. + /// Checks and sets buffer_full_. This must be called after flushing a run to + /// make sure there are enough bytes remaining to encode the next run. void CheckBufferFull(); - // The maximum number of values in a single literal run - // (number of groups encodable by a 1-byte indicator * 8) + /// The maximum number of values in a single literal run + /// (number of groups encodable by a 1-byte indicator * 8) static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8; - // Number of bits needed to encode the value. + /// Number of bits needed to encode the value. Must be between 0 and 64. const int bit_width_; - // Underlying buffer. + /// Underlying buffer. BitWriter bit_writer_; - // If true, the buffer is full and subsequent Put()'s will fail. + /// If true, the buffer is full and subsequent Put()'s will fail. bool buffer_full_; - // The maximum byte size a single run can take. + /// The maximum byte size a single run can take. int max_run_byte_size_; - // We need to buffer at most 8 values for literals. This happens when the - // bit_width is 1 (so 8 values fit in one byte). - // TODO: generalize this to other bit widths + /// We need to buffer at most 8 values for literals. This happens when the + /// bit_width is 1 (so 8 values fit in one byte). 
+ /// TODO: generalize this to other bit widths int64_t buffered_values_[8]; - // Number of values in buffered_values_ + /// Number of values in buffered_values_ int num_buffered_values_; - // The current (also last) value that was written and the count of how - // many times in a row that value has been seen. This is maintained even - // if we are in a literal run. If the repeat_count_ get high enough, we switch - // to encoding repeated runs. + /// The current (also last) value that was written and the count of how + /// many times in a row that value has been seen. This is maintained even + /// if we are in a literal run. If the repeat_count_ get high enough, we switch + /// to encoding repeated runs. int64_t current_value_; int repeat_count_; - // Number of literals in the current run. This does not include the literals - // that might be in buffered_values_. Only after we've got a group big enough - // can we decide if they should part of the literal_count_ or repeat_count_ + /// Number of literals in the current run. This does not include the literals + /// that might be in buffered_values_. Only after we've got a group big enough + /// can we decide if they should part of the literal_count_ or repeat_count_ int literal_count_; - // Pointer to a byte in the underlying buffer that stores the indicator byte. - // This is reserved as soon as we need a literal run but the value is written - // when the literal run is complete. + /// Pointer to a byte in the underlying buffer that stores the indicator byte. + /// This is reserved as soon as we need a literal run but the value is written + /// when the literal run is complete. uint8_t* literal_indicator_byte_; }; template inline bool RleDecoder::Get(T* val) { + DCHECK_GE(bit_width_, 0); if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) { - // Read the next run's indicator int, it could be a literal or repeated run - // The int is encoded as a vlq-encoded value. 
- uint64_t indicator_value = 0; - bool result = bit_reader_.GetVlqInt(&indicator_value); - if (!result) return false; - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - if (is_literal) { - literal_count_ = (indicator_value >> 1) * 8; - } else { - repeat_count_ = indicator_value >> 1; - bool result = bit_reader_.GetAligned( - BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); - DCHECK(result); - } + if (!NextCounts()) return false; } if (LIKELY(repeat_count_ > 0)) { *val = current_value_; --repeat_count_; } else { - DCHECK(literal_count_ > 0); + DCHECK_GT(literal_count_, 0); bool result = bit_reader_.GetValue(bit_width_, val); DCHECK(result); --literal_count_; @@ -265,8 +268,29 @@ inline bool RleDecoder::Get(T* val) { return true; } -// This function buffers input values 8 at a time. After seeing all 8 values, -// it decides whether they should be encoded as a literal or repeated run. +template +bool RleDecoder::NextCounts() { + // Read the next run's indicator int, it could be a literal or repeated run. + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (!result) return false; + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + } else { + repeat_count_ = indicator_value >> 1; + bool result = bit_reader_.GetAligned( + BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); + DCHECK(result); + } + return true; +} + +/// This function buffers input values 8 at a time. After seeing all 8 values, +/// it decides whether they should be encoded as a literal or repeated run. 
inline bool RleEncoder::Put(uint64_t value) { DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); if (UNLIKELY(buffer_full_)) return false; @@ -341,8 +365,8 @@ inline void RleEncoder::FlushRepeatedRun() { CheckBufferFull(); } -// Flush the values that have been buffered. At this point we decide whether -// we need to switch between the run types or continue the current one. +/// Flush the values that have been buffered. At this point we decide whether +/// we need to switch between the run types or continue the current one. inline void RleEncoder::FlushBufferedValues(bool done) { if (repeat_count_ >= 8) { // Clear the buffered values. They are part of the repeated run now and we diff --git a/cpp/src/parquet/util/rle-test.cc b/cpp/src/parquet/util/rle-test.cc new file mode 100644 index 0000000000000..b2628e981d6e1 --- /dev/null +++ b/cpp/src/parquet/util/rle-test.cc @@ -0,0 +1,400 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// From Apache Impala as of 2016-01-29 + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "parquet/util/rle-encoding.h" +#include "parquet/util/bit-stream-utils.inline.h" + +using std::vector; + +namespace parquet_cpp { + +const int MAX_WIDTH = 32; + +TEST(BitArray, TestBool) { + const int len = 8; + uint8_t buffer[len]; + + BitWriter writer(buffer, len); + + // Write alternating 0's and 1's + for (int i = 0; i < 8; ++i) { + bool result = writer.PutValue(i % 2, 1); + EXPECT_TRUE(result); + } + writer.Flush(); + EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); + + // Write 00110011 + for (int i = 0; i < 8; ++i) { + bool result = false; + switch (i) { + case 0: + case 1: + case 4: + case 5: + result = writer.PutValue(false, 1); + break; + default: + result = writer.PutValue(true, 1); + break; + } + EXPECT_TRUE(result); + } + writer.Flush(); + + // Validate the exact bit value + EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); + EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0)); + + // Use the reader and validate + BitReader reader(buffer, len); + for (int i = 0; i < 8; ++i) { + bool val = false; + bool result = reader.GetValue(1, &val); + EXPECT_TRUE(result); + EXPECT_EQ(val, i % 2); + } + + for (int i = 0; i < 8; ++i) { + bool val = false; + bool result = reader.GetValue(1, &val); + EXPECT_TRUE(result); + switch (i) { + case 0: + case 1: + case 4: + case 5: + EXPECT_EQ(val, false); + break; + default: + EXPECT_EQ(val, true); + break; + } + } +} + +// Writes 'num_vals' values with width 'bit_width' and reads them back. +void TestBitArrayValues(int bit_width, int num_vals) { + const int len = BitUtil::Ceil(bit_width * num_vals, 8); + const uint64_t mod = bit_width == 64? 
1 : 1LL << bit_width; + + uint8_t buffer[len]; + BitWriter writer(buffer, len); + for (int i = 0; i < num_vals; ++i) { + bool result = writer.PutValue(i % mod, bit_width); + EXPECT_TRUE(result); + } + writer.Flush(); + EXPECT_EQ(writer.bytes_written(), len); + + BitReader reader(buffer, len); + for (int i = 0; i < num_vals; ++i) { + int64_t val; + bool result = reader.GetValue(bit_width, &val); + EXPECT_TRUE(result); + EXPECT_EQ(val, i % mod); + } + EXPECT_EQ(reader.bytes_left(), 0); +} + +TEST(BitArray, TestValues) { + for (int width = 0; width <= MAX_WIDTH; ++width) { + TestBitArrayValues(width, 1); + TestBitArrayValues(width, 2); + // Don't write too many values + TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096); + TestBitArrayValues(width, 1024); + } +} + +// Test some mixed values +TEST(BitArray, TestMixed) { + const int len = 1024; + uint8_t buffer[len]; + bool parity = true; + + BitWriter writer(buffer, len); + for (int i = 0; i < len; ++i) { + bool result; + if (i % 2 == 0) { + result = writer.PutValue(parity, 1); + parity = !parity; + } else { + result = writer.PutValue(i, 10); + } + EXPECT_TRUE(result); + } + writer.Flush(); + + parity = true; + BitReader reader(buffer, len); + for (int i = 0; i < len; ++i) { + bool result; + if (i % 2 == 0) { + bool val; + result = reader.GetValue(1, &val); + EXPECT_EQ(val, parity); + parity = !parity; + } else { + int val; + result = reader.GetValue(10, &val); + EXPECT_EQ(val, i); + } + EXPECT_TRUE(result); + } +} + +// Validates encoding of values by encoding and decoding them. If +// expected_encoding != NULL, also validates that the encoded buffer is +// exactly 'expected_encoding'. +// if expected_len is not -1, it will validate the encoded size is correct. 
+void ValidateRle(const vector& values, int bit_width, + uint8_t* expected_encoding, int expected_len) { + const int len = 64 * 1024; + uint8_t buffer[len]; + EXPECT_LE(expected_len, len); + + RleEncoder encoder(buffer, len, bit_width); + for (int i = 0; i < values.size(); ++i) { + bool result = encoder.Put(values[i]); + EXPECT_TRUE(result); + } + int encoded_len = encoder.Flush(); + + if (expected_len != -1) { + EXPECT_EQ(encoded_len, expected_len); + } + if (expected_encoding != NULL) { + EXPECT_TRUE(memcmp(buffer, expected_encoding, expected_len) == 0); + } + + // Verify read + RleDecoder decoder(buffer, len, bit_width); + for (int i = 0; i < values.size(); ++i) { + uint64_t val; + bool result = decoder.Get(&val); + EXPECT_TRUE(result); + EXPECT_EQ(values[i], val); + } +} + +TEST(Rle, SpecificSequences) { + const int len = 1024; + uint8_t expected_buffer[len]; + vector values; + + // Test 50 0' followed by 50 1's + values.resize(100); + for (int i = 0; i < 50; ++i) { + values[i] = 0; + } + for (int i = 50; i < 100; ++i) { + values[i] = 1; + } + + // expected_buffer valid for bit width <= 1 byte + expected_buffer[0] = (50 << 1); + expected_buffer[1] = 0; + expected_buffer[2] = (50 << 1); + expected_buffer[3] = 1; + for (int width = 1; width <= 8; ++width) { + ValidateRle(values, width, expected_buffer, 4); + } + + for (int width = 9; width <= MAX_WIDTH; ++width) { + ValidateRle(values, width, NULL, 2 * (1 + BitUtil::Ceil(width, 8))); + } + + // Test 100 0's and 1's alternating + for (int i = 0; i < 100; ++i) { + values[i] = i % 2; + } + int num_groups = BitUtil::Ceil(100, 8); + expected_buffer[0] = (num_groups << 1) | 1; + for (int i = 1; i <= 100/8; ++i) { + expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0); + } + // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0. 
+ expected_buffer[100/8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0); + + // num_groups and expected_buffer only valid for bit width = 1 + ValidateRle(values, 1, expected_buffer, 1 + num_groups); + for (int width = 2; width <= MAX_WIDTH; ++width) { + int num_values = BitUtil::Ceil(100, 8) * 8; + ValidateRle(values, width, NULL, 1 + BitUtil::Ceil(width * num_values, 8)); + } +} + +// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value +// is used, otherwise alternating values are used. +void TestRleValues(int bit_width, int num_vals, int value = -1) { + const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width; + vector values; + for (int v = 0; v < num_vals; ++v) { + values.push_back((value != -1) ? value : (v % mod)); + } + ValidateRle(values, bit_width, NULL, -1); +} + +TEST(Rle, TestValues) { + for (int width = 1; width <= MAX_WIDTH; ++width) { + TestRleValues(width, 1); + TestRleValues(width, 1024); + TestRleValues(width, 1024, 0); + TestRleValues(width, 1024, 1); + } +} + +TEST(Rle, BitWidthZeroRepeated) { + uint8_t buffer[1]; + const int num_values = 15; + buffer[0] = num_values << 1; // repeated indicator byte + RleDecoder decoder(buffer, sizeof(buffer), 0); + uint8_t val; + for (int i = 0; i < num_values; ++i) { + bool result = decoder.Get(&val); + EXPECT_TRUE(result); + EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 + } + EXPECT_FALSE(decoder.Get(&val)); +} + +TEST(Rle, BitWidthZeroLiteral) { + uint8_t buffer[1]; + const int num_groups = 4; + buffer[0] = num_groups << 1 | 1; // literal indicator byte + RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0); + const int num_values = num_groups * 8; + uint8_t val; + for (int i = 0; i < num_values; ++i) { + bool result = decoder.Get(&val); + EXPECT_TRUE(result); + EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 + } + EXPECT_FALSE(decoder.Get(&val)); +} + +// Test that writes out a repeated group and then a literal +// group but flush before finishing. 
+TEST(BitRle, Flush) { + vector values; + for (int i = 0; i < 16; ++i) values.push_back(1); + values.push_back(0); + ValidateRle(values, 1, NULL, -1); + values.push_back(1); + ValidateRle(values, 1, NULL, -1); + values.push_back(1); + ValidateRle(values, 1, NULL, -1); + values.push_back(1); + ValidateRle(values, 1, NULL, -1); +} + +// Test some random sequences. +TEST(BitRle, Random) { + int iters = 0; + while (iters < 1000) { + srand(iters++); + if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; + vector values; + bool parity = 0; + for (int i = 0; i < 1000; ++i) { + int group_size = rand() % 20 + 1; // NOLINT + if (group_size > 16) { + group_size = 1; + } + for (int i = 0; i < group_size; ++i) { + values.push_back(parity); + } + parity = !parity; + } + ValidateRle(values, (iters % MAX_WIDTH) + 1, NULL, -1); + } +} + +// Test a sequence of 1 0's, 2 1's, 3 0's. etc +// e.g. 011000111100000 +TEST(BitRle, RepeatedPattern) { + vector values; + const int min_run = 1; + const int max_run = 32; + + for (int i = min_run; i <= max_run; ++i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + // And go back down again + for (int i = max_run; i >= min_run; --i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + ValidateRle(values, 1, NULL, -1); +} + +TEST(BitRle, Overflow) { + for (int bit_width = 1; bit_width < 32; bit_width += 3) { + const int len = RleEncoder::MinBufferSize(bit_width); + uint8_t buffer[len]; + int num_added = 0; + bool parity = true; + + RleEncoder encoder(buffer, len, bit_width); + // Insert alternating true/false until there is no space left + while (true) { + bool result = encoder.Put(parity); + parity = !parity; + if (!result) break; + ++num_added; + } + + int bytes_written = encoder.Flush(); + EXPECT_LE(bytes_written, len); + EXPECT_GT(num_added, 0); + + RleDecoder decoder(buffer, bytes_written, bit_width); + parity = true; + uint32_t v; + for (int i = 0; i < num_added; ++i) { 
+ bool result = decoder.Get(&v); + EXPECT_TRUE(result); + EXPECT_EQ(v, parity); + parity = !parity; + } + // Make sure we get false when reading past the end a couple times. + EXPECT_FALSE(decoder.Get(&v)); + EXPECT_FALSE(decoder.Get(&v)); + } +} + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/util/sse-util.h b/cpp/src/parquet/util/sse-util.h new file mode 100644 index 0000000000000..588c30a07f238 --- /dev/null +++ b/cpp/src/parquet/util/sse-util.h @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. Pared down to a minimal set of +// functions needed for parquet-cpp + +#ifndef PARQUET_UTIL_SSE_UTIL_H +#define PARQUET_UTIL_SSE_UTIL_H + +#include + +namespace parquet_cpp { + +/// This namespace contains constants useful for text processing with SSE4.2 intrinsics. +namespace SSEUtil { + /// Number of characters that fit in 64/128 bit register. SSE provides instructions + /// for loading 64 or 128 bits into a register at a time. + static const int CHARS_PER_64_BIT_REGISTER = 8; + static const int CHARS_PER_128_BIT_REGISTER = 16; + + /// SSE4.2 adds instructions for text processing. 
The instructions have a control + /// byte that determines some of the functionality of the instruction. + /// (Equivalent to GCC's _SIDD_CMP_EQUAL_ANY, etc). + static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr + static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp + static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) + static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. + + /// In this mode, SSE text processing functions will return a mask of all the + /// characters that matched. + static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; + + /// In this mode, SSE text processing functions will return the number of + /// bytes that match consecutively from the beginning. + static const int STRCMP_MODE = PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | + PCMPSTR_NEG_POLARITY; + + /// Precomputed mask values up to 16 bits. + static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { + 1 << 0, + 1 << 1, + 1 << 2, + 1 << 3, + 1 << 4, + 1 << 5, + 1 << 6, + 1 << 7, + 1 << 8, + 1 << 9, + 1 << 10, + 1 << 11, + 1 << 12, + 1 << 13, + 1 << 14, + 1 << 15, + }; +} // namespace SSEUtil + +/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen +/// IR load time) that the processor supports SSE 4.2 before calling these. These are +/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros. +#ifndef IR_COMPILE +/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler +/// flag. Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime +/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2 +/// (IMPALA-1399/1646). The compiler intrinsics cannot be used without -msse4.2, so we +/// define our own implementations of the intrinsics instead. 
+ +#if defined(__SSE4_1__) || defined(__POPCNT__) +/// Impala native code should not be compiled with -msse4.1 or higher until the minimum +/// CPU requirement is raised to at least the targeted instruction set. +#error "Do not compile with -msse4.1 or higher." +#endif + +/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an +/// immediate. So, those need to be always inlined in order to always propagate the +/// mode constant into the inline asm. +#define SSE_ALWAYS_INLINE inline __attribute__ ((__always_inline__)) + +template +static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { + /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 - + /// clang doesn't support Y-prefixed asm constraints. + register volatile __m128i result asm("xmm0"); + __asm__ volatile ("pcmpestrm %5, %2, %1" + : "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc"); + return result; +} + +template +static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { + int result; + __asm__("pcmpestri %5, %2, %1" + : "=c"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc"); + return result; +} + +static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { + __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v)); + return crc; +} + +static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { + __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v)); + return crc; +} + +static inline int64_t POPCNT_popcnt_u64(uint64_t a) { + int64_t result; + __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc"); + return result; +} + +#undef SSE_ALWAYS_INLINE + +#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2. +/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not +/// support it. However, the cross-compiled IR is compiled twice: with and without +/// -msse4.2. 
When -msse4.2 is enabled in the cross-compile, we can just use the +/// compiler intrinsics. + +#include + +template +static inline __m128i SSE4_cmpestrm( + __m128i str1, int len1, __m128i str2, int len2) { + return _mm_cmpestrm(str1, len1, str2, len2, MODE); +} + +template +static inline int SSE4_cmpestri( + __m128i str1, int len1, __m128i str2, int len2) { + return _mm_cmpestri(str1, len1, str2, len2, MODE); +} + +#define SSE4_crc32_u8 _mm_crc32_u8 +#define SSE4_crc32_u32 _mm_crc32_u32 +#define POPCNT_popcnt_u64 _mm_popcnt_u64 + +#else // IR_COMPILE without SSE 4.2. +/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use +/// SSE 4.2 instructions. Otherwise, the IR loading will fail on CPUs that don't +/// support SSE 4.2. However, because the caller isn't allowed to call these routines +/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case. + +template +static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { + DCHECK(false) << "CPU doesn't support SSE 4.2"; + return (__m128i) { 0 }; // NOLINT +} + +template +static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { + DCHECK(false) << "CPU doesn't support SSE 4.2"; + return 0; +} + +static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { + DCHECK(false) << "CPU doesn't support SSE 4.2"; + return 0; +} + +static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { + DCHECK(false) << "CPU doesn't support SSE 4.2"; + return 0; +} + +static inline int64_t POPCNT_popcnt_u64(uint64_t a) { + DCHECK(false) << "CPU doesn't support SSE 4.2"; + return 0; +} + +#endif + +} // namespace parquet_cpp + +#endif // PARQUET_UTIL_SSE_UTIL_H