diff --git a/cpp/src/parquet/encodings/encodings.h b/cpp/src/parquet/encodings/encodings.h
index 50b6291adfbfb..b30146a2c5919 100644
--- a/cpp/src/parquet/encodings/encodings.h
+++ b/cpp/src/parquet/encodings/encodings.h
@@ -72,8 +72,9 @@ class Decoder {
 #include "parquet/encodings/plain-encoding.h"
 #include "parquet/encodings/dictionary-encoding.h"
 
-#include "parquet/encodings/delta-bit-pack-encoding.h"
-#include "parquet/encodings/delta-length-byte-array-encoding.h"
-#include "parquet/encodings/delta-byte-array-encoding.h"
+// Disabled: bit-stream-utils changed and lacks the ZigZag functions these encodings use
+// #include "parquet/encodings/delta-bit-pack-encoding.h"
+// #include "parquet/encodings/delta-length-byte-array-encoding.h"
+// #include "parquet/encodings/delta-byte-array-encoding.h"
 
 #endif // PARQUET_ENCODINGS_ENCODINGS_H
diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt
index 8c15de1d1afde..1c86112ddec19 100644
--- a/cpp/src/parquet/util/CMakeLists.txt
+++ b/cpp/src/parquet/util/CMakeLists.txt
@@ -20,6 +20,8 @@ install(FILES
   bit-stream-utils.h
   bit-stream-utils.inline.h
   bit-util.h
+  cpu-info.h
+  sse-util.h
   compiler-util.h
   logging.h
   rle-encoding.h
@@ -29,6 +31,7 @@ install(FILES
 
 add_library(parquet_util STATIC
   input_stream.cc
+  cpu-info.cc
 )
 
 add_library(parquet_test_main
@@ -47,3 +50,6 @@ else()
     pthread
   )
 endif()
+
+ADD_PARQUET_TEST(bit-util-test)
+ADD_PARQUET_TEST(rle-test)
diff --git a/cpp/src/parquet/util/bit-stream-utils.h b/cpp/src/parquet/util/bit-stream-utils.h
index 97ba71b833eb1..a02839dc3b438 100644
--- a/cpp/src/parquet/util/bit-stream-utils.h
+++ b/cpp/src/parquet/util/bit-stream-utils.h
@@ -15,26 +15,28 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// From Apache Impala as of 2016-01-29
+
 #ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H
 #define PARQUET_UTIL_BIT_STREAM_UTILS_H
 
-#include <string.h>
 #include <algorithm>
 #include <cstdint>
+#include <string.h>
 
 #include "parquet/util/compiler-util.h"
-#include "parquet/util/bit-util.h"
 #include "parquet/util/logging.h"
+#include "parquet/util/bit-util.h"
 
 namespace parquet_cpp {
 
-// Utility class to write bit/byte streams.  This class can write data to either be
-// bit packed or byte aligned (and a single stream that has a mix of both).
-// This class does not allocate memory.
+/// Utility class to write bit/byte streams.  This class can write data to either be
+/// bit packed or byte aligned (and a single stream that has a mix of both).
+/// This class does not allocate memory.
 class BitWriter {
  public:
-  // buffer: buffer to write bits to.  Buffer should be preallocated with
-  // 'buffer_len' bytes.
+  /// buffer: buffer to write bits to.  Buffer should be preallocated with
+  /// 'buffer_len' bytes.
   BitWriter(uint8_t* buffer, int buffer_len) :
       buffer_(buffer),
       max_bytes_(buffer_len) {
@@ -47,56 +49,56 @@ class BitWriter {
     bit_offset_ = 0;
   }
 
-  // The number of current bytes written, including the current byte (i.e. may include a
-  // fraction of a byte). Includes buffered values.
+  /// The number of current bytes written, including the current byte (i.e. may include a
+  /// fraction of a byte). Includes buffered values.
   int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); }
   uint8_t* buffer() const { return buffer_; }
   int buffer_len() const { return max_bytes_; }
 
-  // Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
-  // packed.  Returns false if there was not enough space. num_bits must be <= 32.
+  /// Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
+  /// packed.  Returns false if there was not enough space. num_bits must be <= 32.
   bool PutValue(uint64_t v, int num_bits);
 
-  // Writes v to the next aligned byte using num_bytes. If T is larger than num_bytes, the
-  // extra high-order bytes will be ignored. Returns false if there was not enough space.
+  /// Writes v to the next aligned byte using num_bytes. If T is larger than
+  /// num_bytes, the extra high-order bytes will be ignored. Returns false if
+  /// there was not enough space.
   template<typename T>
   bool PutAligned(T v, int num_bytes);
 
-  // Write a Vlq encoded int to the buffer.  Returns false if there was not enough
-  // room.  The value is written byte aligned.
-  // For more details on vlq:
-  // en.wikipedia.org/wiki/Variable-length_quantity
-  bool PutVlqInt(uint32_t v);
-  bool PutZigZagVlqInt(int32_t v);
+  /// Write a Vlq encoded int to the buffer.  Returns false if there was not enough
+  /// room.  The value is written byte aligned.  For more details on vlq, see
+  /// en.wikipedia.org/wiki/Variable-length_quantity
+  /// NOTE(review): 'v' is signed now; confirm negative values encode and terminate.
+  bool PutVlqInt(int32_t v);
 
-  // Get a pointer to the next aligned byte and advance the underlying buffer
-  // by num_bytes.
-  // Returns NULL if there was not enough space.
+  /// Get a pointer to the next aligned byte and advance the underlying buffer
+  /// by num_bytes.
+  /// Returns NULL if there was not enough space.
   uint8_t* GetNextBytePtr(int num_bytes = 1);
 
-  // Flushes all buffered values to the buffer. Call this when done writing to the buffer.
-  // If 'align' is true, buffered_values_ is reset and any future writes will be written
-  // to the next byte boundary.
+  /// Flushes all buffered values to the buffer. Call this when done writing to
+  /// the buffer.  If 'align' is true, buffered_values_ is reset and any future
+  /// writes will be written to the next byte boundary.
   void Flush(bool align = false);
 
  private:
   uint8_t* buffer_;
   int max_bytes_;
 
-  // Bit-packed values are initially written to this variable before being memcpy'd to
-  // buffer_. This is faster than writing values byte by byte directly to buffer_.
+  /// Bit-packed values are initially written to this variable before being memcpy'd to
+  /// buffer_. This is faster than writing values byte by byte directly to buffer_.
   uint64_t buffered_values_;
 
   int byte_offset_;       // Offset in buffer_
   int bit_offset_;        // Offset in buffered_values_
 };
 
-// Utility class to read bit/byte stream.  This class can read bits or bytes
-// that are either byte aligned or not.  It also has utilities to read multiple
-// bytes in one read (e.g. encoded int).
+/// Utility class to read bit/byte stream.  This class can read bits or bytes
+/// that are either byte aligned or not.  It also has utilities to read multiple
+/// bytes in one read (e.g. encoded int).
 class BitReader {
  public:
-  // 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
+  /// 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
   BitReader(const uint8_t* buffer, int buffer_len) :
       buffer_(buffer),
       max_bytes_(buffer_len),
@@ -108,36 +110,48 @@ class BitReader {
 
   BitReader() : buffer_(NULL), max_bytes_(0) {}
 
-  // Gets the next value from the buffer.  Returns true if 'v' could be read or false if
-  // there are not enough bytes left. num_bits must be <= 32.
+  void Reset(const uint8_t* buffer, int buffer_len) {  // rebinds and reloads the cache
+    buffer_ = buffer;
+    max_bytes_ = buffer_len;
+    buffered_values_ = byte_offset_ = bit_offset_ = 0;  // forget the old buffer's state
+    if (max_bytes_ > 0) memcpy(&buffered_values_, buffer_, std::min(8, max_bytes_));
+  }
+
+  /// Gets the next value from the buffer.  Returns true if 'v' could be read or false if
+  /// there are not enough bytes left. num_bits must be <= 32.
   template<typename T>
   bool GetValue(int num_bits, T* v);
 
-  // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a
-  // little-endian native type and big enough to store 'num_bytes'. The value is assumed
-  // to be byte-aligned so the stream will be advanced to the start of the next byte
-  // before 'v' is read. Returns false if there are not enough bytes left.
+  /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+  /// needs to be a little-endian native type and big enough to store
+  /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+  /// be advanced to the start of the next byte before 'v' is read. Returns
+  /// false if there are not enough bytes left.
   template<typename T>
   bool GetAligned(int num_bytes, T* v);
 
-  // Reads a vlq encoded int from the stream.  The encoded int must start at the
-  // beginning of a byte. Return false if there were not enough bytes in the buffer.
-  bool GetVlqInt(uint64_t* v);
-  bool GetZigZagVlqInt(int64_t* v);
+  /// Reads a vlq encoded int from the stream.  The encoded int must start at
+  /// the beginning of a byte. Return false if there were not enough bytes in
+  /// the buffer.
+  bool GetVlqInt(int32_t* v);
 
-  // Returns the number of bytes left in the stream, not including the current byte (i.e.,
-  // there may be an additional fraction of a byte).
+  /// Returns the number of bytes left in the stream, not including the current
+  /// byte (i.e., there may be an additional fraction of a byte).
   int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); }
 
-  // Maximum byte length of a vlq encoded int
+  /// Maximum byte length of a vlq encoded int
   static const int MAX_VLQ_BYTE_LEN = 5;
 
+  // TODO(nongli): implementations to be fixed given changes in Impala
+  // bool GetZigZagVlqInt(int64_t* v);
+  // bool PutZigZagVlqInt(int32_t v);
+
  private:
   const uint8_t* buffer_;
   int max_bytes_;
 
-  // Bytes are memcpy'd from buffer_ and values are read from this variable. This is
-  // faster than reading values byte by byte directly from buffer_.
+  /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
+  /// faster than reading values byte by byte directly from buffer_.
   uint64_t buffered_values_;
 
   int byte_offset_;       // Offset in buffer_
@@ -146,4 +160,4 @@ class BitReader {
 
 } // namespace parquet_cpp
 
-#endif
+#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H
diff --git a/cpp/src/parquet/util/bit-stream-utils.inline.h b/cpp/src/parquet/util/bit-stream-utils.inline.h
index 6171b1fb04f6a..77e2d48817110 100644
--- a/cpp/src/parquet/util/bit-stream-utils.inline.h
+++ b/cpp/src/parquet/util/bit-stream-utils.inline.h
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// From Apache Impala as of 2016-01-29
+
 #ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
 #define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
 
@@ -73,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
   return true;
 }
 
-inline bool BitWriter::PutVlqInt(uint32_t v) {
+inline bool BitWriter::PutVlqInt(int32_t v) {
   bool result = true;
   while ((v & 0xFFFFFF80) != 0L) {
     result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
@@ -83,13 +85,9 @@ inline bool BitWriter::PutVlqInt(uint32_t v) {
   return result;
 }
 
-inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-  uint32_t u = (v << 1) ^ (v >> 31);
-  return PutVlqInt(u);
-}
-
 template<typename T>
 inline bool BitReader::GetValue(int num_bits, T* v) {
+  DCHECK(buffer_ != NULL);
   // TODO: revisit this limit if necessary
   DCHECK_LE(num_bits, 32);
   DCHECK_LE(num_bits, sizeof(T) * 8);
@@ -140,7 +138,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) {
   return true;
 }
 
-inline bool BitReader::GetVlqInt(uint64_t* v) {
+inline bool BitReader::GetVlqInt(int32_t* v) {
   *v = 0;
   int shift = 0;
   int num_bytes = 0;
@@ -154,12 +152,20 @@ inline bool BitReader::GetVlqInt(uint64_t* v) {
   return true;
 }
 
-inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
-  uint64_t u;
-  if (!GetVlqInt(&u)) return false;
-  *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1);
-  return true;
-}
+// TODO(nongli): review/test these implementations given divergence in Impala
+// functions
+
+// inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+//   uint32_t u = (v << 1) ^ (v >> 31);
+//   return PutVlqInt(u);
+// }
+
+// inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
+//   uint64_t u;
+//   if (!GetVlqInt(&u)) return false;
+//   *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1);
+//   return true;
+// }
 
 } // namespace parquet_cpp
 
diff --git a/cpp/src/parquet/util/bit-util-test.cc b/cpp/src/parquet/util/bit-util-test.cc
new file mode 100644
index 0000000000000..78efe1a85536e
--- /dev/null
+++ b/cpp/src/parquet/util/bit-util-test.cc
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <limits.h>
+
+#include <boost/utility.hpp>
+#include <gtest/gtest.h>
+
+#include "parquet/util/bit-util.h"
+#include "parquet/util/cpu-info.h"
+
+namespace parquet_cpp {
+
+TEST(BitUtil, Ceil) {
+  EXPECT_EQ(BitUtil::Ceil(0, 1), 0);
+  EXPECT_EQ(BitUtil::Ceil(1, 1), 1);
+  EXPECT_EQ(BitUtil::Ceil(1, 2), 1);
+  EXPECT_EQ(BitUtil::Ceil(1, 8), 1);
+  EXPECT_EQ(BitUtil::Ceil(7, 8), 1);
+  EXPECT_EQ(BitUtil::Ceil(8, 8), 1);
+  EXPECT_EQ(BitUtil::Ceil(9, 8), 2);
+  EXPECT_EQ(BitUtil::Ceil(9, 9), 1);
+  EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000);
+  EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1);
+  EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10);
+}
+
+TEST(BitUtil, RoundUp) {
+  EXPECT_EQ(BitUtil::RoundUp(0, 1), 0);
+  EXPECT_EQ(BitUtil::RoundUp(1, 1), 1);
+  EXPECT_EQ(BitUtil::RoundUp(1, 2), 2);
+  EXPECT_EQ(BitUtil::RoundUp(6, 2), 6);
+  EXPECT_EQ(BitUtil::RoundUp(7, 3), 9);
+  EXPECT_EQ(BitUtil::RoundUp(9, 9), 9);
+  EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010);
+  EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000);
+  EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000);
+}
+
+TEST(BitUtil, RoundDown) {
+  EXPECT_EQ(BitUtil::RoundDown(0, 1), 0);
+  EXPECT_EQ(BitUtil::RoundDown(1, 1), 1);
+  EXPECT_EQ(BitUtil::RoundDown(1, 2), 0);
+  EXPECT_EQ(BitUtil::RoundDown(6, 2), 6);
+  EXPECT_EQ(BitUtil::RoundDown(7, 3), 6);
+  EXPECT_EQ(BitUtil::RoundDown(9, 9), 9);
+  EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000);
+  EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0);
+  EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000);
+}
+
+TEST(BitUtil, Popcount) {
+  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
+  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
+  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
+  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
+  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
+  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
+  EXPECT_EQ(BitUtil::Popcount(0), 0);
+  EXPECT_EQ(BitUtil::PopcountNoHw(0), 0);
+}
+
+TEST(BitUtil, TrailingBits) {
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1);
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64),
+            BOOST_BINARY(1 1 1 1 1 1 1 1));
+  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100),
+            BOOST_BINARY(1 1 1 1 1 1 1 1));
+  EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(1ULL << 63, 0), 0);  // 1ULL: 1LL << 63 overflows
+  EXPECT_EQ(BitUtil::TrailingBits(1ULL << 63, 63), 0);
+  EXPECT_EQ(BitUtil::TrailingBits(1ULL << 63, 64), 1ULL << 63);
+}
+
+TEST(BitUtil, ByteSwap) {
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0x11223344)), 0x44332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0x11223344)), 0x44332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint64_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(
+      static_cast<uint64_t>(0x1122334455667788)), 0x8877665544332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int64_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(
+      static_cast<int64_t>(0x1122334455667788)), 0x8877665544332211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0x1122)), 0x2211);
+
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0);
+  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211);
+}
+
+TEST(BitUtil, Log2) {
+  EXPECT_EQ(BitUtil::Log2(1), 0);
+  EXPECT_EQ(BitUtil::Log2(2), 1);
+  EXPECT_EQ(BitUtil::Log2(3), 2);
+  EXPECT_EQ(BitUtil::Log2(4), 2);
+  EXPECT_EQ(BitUtil::Log2(5), 3);
+  EXPECT_EQ(BitUtil::Log2(INT_MAX), 31);
+  EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32);
+  EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64);
+}
+
+TEST(BitUtil, RoundUpToPowerOf2) {
+  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 8), 8);
+  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8);
+  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16);
+}
+
+TEST(BitUtil, RoundDownToPowerOf2) {
+  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0);
+  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8);
+  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8);
+}
+
+TEST(BitUtil, RoundUpDown) {
+  EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2);
+  EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0);
+  EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1);
+  EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1);
+
+  EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2);
+  EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0);
+  EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1);
+  EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1);
+
+  EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1);
+  EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2);
+  EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0);
+  EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1);
+  EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
+}
+
+} // namespace parquet_cpp
diff --git a/cpp/src/parquet/util/bit-util.h b/cpp/src/parquet/util/bit-util.h
index 593d1c2cfe447..4db585a0ccc67 100644
--- a/cpp/src/parquet/util/bit-util.h
+++ b/cpp/src/parquet/util/bit-util.h
@@ -15,47 +15,132 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// From Apache Impala as of 2016-01-29
+
 #ifndef PARQUET_UTIL_BIT_UTIL_H
 #define PARQUET_UTIL_BIT_UTIL_H
 
 #if defined(__APPLE__)
-  #include <machine/endian.h>
+#include <machine/endian.h>
 #else
-  #include <endian.h>
+#include <endian.h>
 #endif
 
+#include <boost/type_traits/make_unsigned.hpp>
+
 #include "parquet/util/compiler-util.h"
-#include "parquet/util/logging.h"
+#include "parquet/util/cpu-info.h"
+#include "parquet/util/sse-util.h"
 
 namespace parquet_cpp {
 
-// Utility class to do standard bit tricks
-// TODO: is this in boost or something else like that?
+using boost::make_unsigned;
+
+/// Utility class to do standard bit tricks
+/// TODO: is this in boost or something else like that?
 class BitUtil {
  public:
-  // Returns the ceil of value/divisor
-  static inline int Ceil(int value, int divisor) {
+  /// Returns the ceil of value/divisor
+  static inline int64_t Ceil(int64_t value, int64_t divisor) {
     return value / divisor + (value % divisor != 0);
   }
 
-  // Returns 'value' rounded up to the nearest multiple of 'factor'
-  static inline int RoundUp(int value, int factor) {
+  /// Returns 'value' rounded up to the nearest multiple of 'factor'
+  static inline int64_t RoundUp(int64_t value, int64_t factor) {
     return (value + (factor - 1)) / factor * factor;
   }
 
-  // Returns 'value' rounded down to the nearest multiple of 'factor'
-  static inline int RoundDown(int value, int factor) {
+  /// Returns 'value' rounded down to the nearest multiple of 'factor'
+  static inline int64_t RoundDown(int64_t value, int64_t factor) {
     return (value / factor) * factor;
   }
 
-  // Returns the number of set bits in x
-  static inline int Popcount(uint64_t x) {
+  /// Returns the smallest power of two that is >= v (for positive v); a v that is
+  /// already a power of two is returned unchanged. Taken from
+  /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+  /// TODO: Pick a better name, as the current one suggests a strictly larger result.
+  static inline int64_t NextPowerOfTwo(int64_t v) {
+    --v;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v |= v >> 32;
+    ++v;
+    return v;
+  }
+
+  /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
+  /// a power of two
+  static inline int RoundUpToPowerOf2(int value, int factor) {
+    DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
+    return (value + (factor - 1)) & ~(factor - 1);
+  }
+
+  static inline int RoundDownToPowerOf2(int value, int factor) {
+    DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
+    return value & ~(factor - 1);
+  }
+
+  /// Specialized round up and down functions for frequently used factors,
+  /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
+  /// Returns the number of bytes needed to hold 'bits' bits (rounds up).
+  static inline uint32_t RoundUpNumBytes(uint32_t bits) {
+    return (bits >> 3) + ((bits & 7) != 0);  // avoids 'bits + 7' wrapping at UINT32_MAX
+  }
+
+  /// Returns the rounded down number of bytes that fit the number of bits.
+  static inline uint32_t RoundDownNumBytes(uint32_t bits) {
+    return bits >> 3;
+  }
+
+  /// Returns the number of 32-bit words needed to hold 'bits' bits (rounds up).
+  static inline uint32_t RoundUpNumi32(uint32_t bits) {
+    return (bits >> 5) + ((bits & 31) != 0);  // avoids 'bits + 31' wrapping at UINT32_MAX
+  }
+
+  /// Returns the number of whole 32-bit words contained in 'bits' bits (rounds down).
+  static inline uint32_t RoundDownNumi32(uint32_t bits) {
+    return bits >> 5;
+  }
+
+  /// Returns the number of 64-bit words needed to hold 'bits' bits (rounds up).
+  static inline uint32_t RoundUpNumi64(uint32_t bits) {
+    return (bits >> 6) + ((bits & 63) != 0);  // avoids 'bits + 63' wrapping at UINT32_MAX
+  }
+
+  /// Returns the number of whole 64-bit words contained in 'bits' bits (rounds down).
+  static inline uint32_t RoundDownNumi64(uint32_t bits) {
+    return bits >> 6;
+  }
+
+  /// Non hw accelerated pop count.
+  /// TODO: we don't use this in any perf sensitive code paths currently.  There
+  /// might be a much faster way to implement this.
+  static inline int PopcountNoHw(uint64_t x) {
     int count = 0;
     for (; x != 0; ++count) x &= x-1;
     return count;
   }
 
-  // Returns the 'num_bits' least-significant bits of 'v'.
+  /// Counts the set bits in x, using the POPCNT instruction when CpuInfo reports it
+  static inline int Popcount(uint64_t x) {
+    // Take the portable loop only when the POPCNT flag is not set in CpuInfo.
+    if (UNLIKELY(!CpuInfo::IsSupported(CpuInfo::POPCNT))) {
+      return PopcountNoHw(x);
+    }
+    return POPCNT_popcnt_u64(x);
+  }
+
+  // Compute correct population count for various-width signed integers
+  template<typename T>
+  static inline int PopcountSigned(T v) {
+    // Converting to same-width unsigned then extending preserves the bit pattern.
+    return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v));
+  }
+
+  /// Returns the 'num_bits' least-significant bits of 'v'.
   static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
     if (UNLIKELY(num_bits == 0)) return 0;
     if (UNLIKELY(num_bits >= 64)) return v;
@@ -63,11 +148,12 @@ class BitUtil {
     return (v << n) >> n;
   }
 
-  // Returns ceil(log2(x)).
-  // TODO: this could be faster if we use __builtin_clz.  Fix this if this ever shows up
-  // in a hot path.
+  /// Returns ceil(log2(x)).
+  /// TODO: this could be faster if we use __builtin_clz.  Fix this if this ever shows up
+  /// in a hot path.
   static inline int Log2(uint64_t x) {
-    if (x == 0) return 0;
+    DCHECK_GT(x, 0);
+    if (x == 1) return 0;
     // Compute result = ceil(log2(x))
     //                = floor(log2(x - 1)) + 1, for x > 1
     // by finding the position of the most significant bit (1-indexed) of x - 1
@@ -78,15 +164,7 @@ class BitUtil {
     return result;
   }
 
-  // Returns the minimum number of bits needed to represent the value of 'x'
-  static inline int NumRequiredBits(uint64_t x) {
-    for (int i = 63; i >= 0; --i) {
-      if (x & 1L << i) return i + 1;
-    }
-    return 0;
-  }
-
-  // Swaps the byte order (i.e. endianess)
+  /// Swaps the byte order (i.e. endianness)
   static inline int64_t ByteSwap(int64_t value) {
     return __builtin_bswap64(value);
   }
@@ -106,7 +184,7 @@ class BitUtil {
     return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
   }
 
-  // Write the swapped bytes into dst. Src and st cannot overlap.
+  /// Write the swapped bytes into dst. src and dst cannot overlap.
   static inline void ByteSwap(void* dst, const void* src, int len) {
     switch (len) {
       case 1:
@@ -134,8 +212,8 @@ class BitUtil {
     }
   }
 
-  // Converts to big endian format (if not already in big endian) from the
-  // machine's native endian format.
+  /// Converts to big endian format (if not already in big endian) from the
+  /// machine's native endian format.
 #if __BYTE_ORDER == __LITTLE_ENDIAN
   static inline int64_t  ToBigEndian(int64_t value)  { return ByteSwap(value); }
   static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
@@ -152,7 +230,7 @@ class BitUtil {
   static inline uint16_t ToBigEndian(uint16_t val) { return val; }
 #endif
 
-  // Converts from big endian format to the machine's native endian format.
+  /// Converts from big endian format to the machine's native endian format.
 #if __BYTE_ORDER == __LITTLE_ENDIAN
   static inline int64_t  FromBigEndian(int64_t value)  { return ByteSwap(value); }
   static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
@@ -168,6 +246,36 @@ class BitUtil {
   static inline int16_t  FromBigEndian(int16_t val)  { return val; }
   static inline uint16_t FromBigEndian(uint16_t val) { return val; }
 #endif
+
+  // Logical right shift for signed integer types
+  // This is needed because the C >> operator does arithmetic right shift
+  // Negative shift amounts lead to undefined behavior
+  template<typename T>
+  static T ShiftRightLogical(T v, int shift) {
+    // Conversion to unsigned ensures most significant bits always filled with 0's
+    return static_cast<typename make_unsigned<T>::type>(v) >> shift;
+  }
+
+  // Returns the value (0 or 1) of the bit at position 'bitpos' of v
+  template<typename T>
+  static inline int8_t GetBit(T v, int bitpos) {
+    T masked = v & (static_cast<T>(0x1) << bitpos);
+    return static_cast<int8_t>(ShiftRightLogical(masked, bitpos));
+  }
+
+  // Set a specific bit to 1
+  // Behavior when bitpos is negative is undefined
+  template<typename T>
+  static T SetBit(T v, int bitpos) {
+    return v | (static_cast<T>(0x1) << bitpos);
+  }
+
+  // Set a specific bit to 0
+  // Behavior when bitpos is negative is undefined
+  template<typename T>
+  static T UnsetBit(T v, int bitpos) {
+    return v & ~(static_cast<T>(0x1) << bitpos);
+  }
 };
 
 } // namespace parquet_cpp
diff --git a/cpp/src/parquet/util/cpu-info.cc b/cpp/src/parquet/util/cpu-info.cc
new file mode 100644
index 0000000000000..610fb623ed042
--- /dev/null
+++ b/cpp/src/parquet/util/cpu-info.cc
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
+// functions needed for parquet-cpp
+
+#include "parquet/util/cpu-info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#endif
+
+#include <boost/algorithm/string.hpp>
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <fstream>
+#include <mmintrin.h>
+#include <sstream>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <string>
+
+#include "parquet/exception.h"
+
+using boost::algorithm::contains;
+using boost::algorithm::trim;
+using std::max;
+using std::string;
+
+namespace parquet_cpp {
+
+bool CpuInfo::initialized_ = false;
+int64_t CpuInfo::hardware_flags_ = 0;
+int64_t CpuInfo::original_hardware_flags_;
+int64_t CpuInfo::cache_sizes_[L3_CACHE + 1];
+int64_t CpuInfo::cycles_per_ms_;
+int CpuInfo::num_cores_ = 1;
+string CpuInfo::model_name_ = "unknown"; // NOLINT
+
+static struct {
+  string name;
+  int64_t flag;
+} flag_mappings[] = {
+  { "ssse3",  CpuInfo::SSSE3 },
+  { "sse4_1", CpuInfo::SSE4_1 },
+  { "sse4_2", CpuInfo::SSE4_2 },
+  { "popcnt", CpuInfo::POPCNT },
+};
+static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+
+// Helper function to parse for hardware flags.
+// 'values' contains a list of space-separated flags.  Checks (by substring match)
+// whether each flag named in flag_mappings is present.
+// Returns a bitmap of the CpuInfo flag constants that were found.
+int64_t ParseCPUFlags(const string& values) {
+  int64_t flags = 0;
+  for (int i = 0; i < num_flags; ++i) {
+    if (contains(values, flag_mappings[i].name)) {
+      flags |= flag_mappings[i].flag;
+    }
+  }
+  return flags;
+}
+
+// Populates the static CpuInfo state: hardware flags, core count, model name and
+// max clock (parsed from /proc/cpuinfo) plus the L1/L2/L3 cache sizes.
+void CpuInfo::Init() {
+  string line;
+  string name;
+  string value;
+
+  float max_mhz = 0;
+  int num_cores = 0;
+
+  memset(&cache_sizes_, 0, sizeof(cache_sizes_));
+
+  // Read from /proc/cpuinfo
+  std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
+  // Loop on getline() itself so a final line without a trailing newline is not
+  // processed a second time after the stream hits EOF.
+  while (getline(cpuinfo, line)) {
+    size_t colon = line.find(':');
+    if (colon != string::npos) {
+      // Take everything before the colon; trim() strips the padding. (Using
+      // colon - 1 would wrap around when the line starts with ':'.)
+      name = line.substr(0, colon);
+      value = line.substr(colon + 1, string::npos);
+      trim(name);
+      trim(value);
+      if (name.compare("flags") == 0) {
+        hardware_flags_ |= ParseCPUFlags(value);
+      } else if (name.compare("cpu MHz") == 0) {
+        // Every core will report a different speed.  We'll take the max, assuming
+        // that when the process is running, the core will not be in a lower power
+        // state.
+        // TODO: is there a more robust way to do this, such as
+        // Windows' QueryPerformanceFrequency()
+        float mhz = atof(value.c_str());
+        max_mhz = max(mhz, max_mhz);
+      } else if (name.compare("processor") == 0) {
+        ++num_cores;
+      } else if (name.compare("model name") == 0) {
+        model_name_ = value;
+      }
+    }
+  }
+  if (cpuinfo.is_open()) cpuinfo.close();
+
+#ifdef __APPLE__
+  // On Mac OS X use sysctl() to get the cache sizes
+  size_t len = 0;
+  sysctlbyname("hw.cachesize", NULL, &len, NULL, 0);
+  uint64_t* data = static_cast<uint64_t*>(malloc(len));
+  if (data != NULL && sysctlbyname("hw.cachesize", data, &len, NULL, 0) == 0) {
+    DCHECK(len / sizeof(uint64_t) >= 3);
+    // Never read past what sysctl actually returned, even in release builds.
+    const size_t num_levels = std::min<size_t>(len / sizeof(uint64_t), 3);
+    for (size_t i = 0; i < num_levels; ++i) cache_sizes_[i] = data[i];
+  }
+  free(data);  // the buffer was previously leaked
+#else
+  // Call sysconf to query for the cache sizes
+  cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+  cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
+  cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
+#endif
+
+  // Fall back to 1 GHz / one core when /proc/cpuinfo gave no usable answer.
+  cycles_per_ms_ = (max_mhz != 0) ? max_mhz * 1000 : 1000000;
+  original_hardware_flags_ = hardware_flags_;
+  num_cores_ = (num_cores > 0) ? num_cores : 1;
+
+  initialized_ = true;
+}
+
+void CpuInfo::VerifyCpuRequirements() {
+  if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) {
+    throw ParquetException("CPU does not support the Supplemental SSE3 instruction set");
+  }
+}
+
+void CpuInfo::EnableFeature(int64_t flag, bool enable) {
+  DCHECK(initialized_);
+  if (enable) {
+    // Only features present in the originally detected flags may be switched on
+    DCHECK((original_hardware_flags_ & flag) != 0);
+    hardware_flags_ |= flag;
+    return;
+  }
+  hardware_flags_ &= ~flag;
+}
+
+} // namespace parquet_cpp
diff --git a/cpp/src/parquet/util/cpu-info.h b/cpp/src/parquet/util/cpu-info.h
new file mode 100644
index 0000000000000..9026cde6630ec
--- /dev/null
+++ b/cpp/src/parquet/util/cpu-info.h
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
+// functions needed for parquet-cpp
+
+#ifndef PARQUET_UTIL_CPU_INFO_H
+#define PARQUET_UTIL_CPU_INFO_H
+
+#include <cstdint>
+#include <string>
+
+#include "parquet/util/logging.h"
+
+namespace parquet_cpp {
+
+/// CpuInfo is an interface to query for cpu information at runtime.  The caller can
+/// ask for the sizes of the caches and what hardware features are supported.
+/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
+/// /sys/devices)
+class CpuInfo {
+ public:
+  static const int64_t SSSE3   = (1 << 1);
+  static const int64_t SSE4_1  = (1 << 2);
+  static const int64_t SSE4_2  = (1 << 3);
+  static const int64_t POPCNT  = (1 << 4);
+
+  /// Cache levels for L1 (data), L2 and L3; the values are used to index cache_sizes_
+  enum CacheLevel {
+    L1_CACHE = 0,
+    L2_CACHE = 1,
+    L3_CACHE = 2,
+  };
+
+  /// Initialize CpuInfo. Must be called first: the accessors below DCHECK on it.
+  static void Init();
+
+  /// Determine if the CPU meets the minimum CPU requirements (SSSE3 support) and if
+  /// not, throw a ParquetException.
+  static void VerifyCpuRequirements();
+
+  /// Returns all the flags for this cpu
+  static int64_t hardware_flags() {
+    DCHECK(initialized_);
+    return hardware_flags_;
+  }
+
+  /// Returns whether or not the cpu supports this flag
+  inline static bool IsSupported(int64_t flag) {
+    DCHECK(initialized_);
+    return (hardware_flags_ & flag) != 0;
+  }
+
+  /// Toggle a hardware feature on and off.  It is not valid to turn on a feature
+  /// that the underlying hardware cannot support. This is useful for testing.
+  static void EnableFeature(int64_t flag, bool enable);
+
+  /// Returns the size of the cache in bytes at this cache level, as reported by the OS
+  static int64_t CacheSize(CacheLevel level) {
+    DCHECK(initialized_);
+    return cache_sizes_[level];
+  }
+
+  /// Returns the number of cpu cycles per millisecond (1000000 if the speed is unknown)
+  static int64_t cycles_per_ms() {
+    DCHECK(initialized_);
+    return cycles_per_ms_;
+  }
+
+  /// Returns the number of cores (including hyper-threaded) on this machine, or 1 if
+  /// none were detected.
+  static int num_cores() {
+    DCHECK(initialized_);
+    return num_cores_;
+  }
+
+  /// Returns the model name of the cpu (e.g. Intel i7-2600)
+  static std::string model_name() {
+    DCHECK(initialized_);
+    return model_name_;
+  }
+
+ private:
+  static bool initialized_;
+  static int64_t hardware_flags_;
+  static int64_t original_hardware_flags_;
+  static int64_t cache_sizes_[L3_CACHE + 1];
+  static int64_t cycles_per_ms_;
+  static int num_cores_;
+  static std::string model_name_; // NOLINT
+};
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_UTIL_CPU_INFO_H
diff --git a/cpp/src/parquet/util/rle-encoding.h b/cpp/src/parquet/util/rle-encoding.h
index e65368e7fc7a2..22b2c2fcaf0c6 100644
--- a/cpp/src/parquet/util/rle-encoding.h
+++ b/cpp/src/parquet/util/rle-encoding.h
@@ -15,74 +15,75 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// From Apache Impala as of 2016-01-29
+
 #ifndef PARQUET_UTIL_RLE_ENCODING_H
 #define PARQUET_UTIL_RLE_ENCODING_H
 
-#include <math.h>
 #include <algorithm>
+#include <math.h>
 
 #include "parquet/util/compiler-util.h"
 #include "parquet/util/bit-stream-utils.inline.h"
 #include "parquet/util/bit-util.h"
-#include "parquet/util/logging.h"
 
 namespace parquet_cpp {
 
-// Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
-// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
-// (literal encoding).
-// For both types of runs, there is a byte-aligned indicator which encodes the length
-// of the run and the type of the run.
-// This encoding has the benefit that when there aren't any long enough runs, values
-// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
-// the run length are byte aligned. This allows for very efficient decoding
-// implementations.
-// The encoding is:
-//    encoded-block := run*
-//    run := literal-run | repeated-run
-//    literal-run := literal-indicator < literal bytes >
-//    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
-//    literal-indicator := varint_encode( number_of_groups << 1 | 1)
-//    repeated-indicator := varint_encode( number_of_repetitions << 1 )
+/// Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
+/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
+/// (literal encoding).
+/// For both types of runs, there is a byte-aligned indicator which encodes the length
+/// of the run and the type of the run.
+/// This encoding has the benefit that when there aren't any long enough runs, values
+/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
+/// the run length are byte aligned. This allows for very efficient decoding
+/// implementations.
+/// The encoding is:
+///    encoded-block := run*
+///    run := literal-run | repeated-run
+///    literal-run := literal-indicator < literal bytes >
+///    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
+///    literal-indicator := varint_encode( number_of_groups << 1 | 1)
+///    repeated-indicator := varint_encode( number_of_repetitions << 1 )
 //
-// Each run is preceded by a varint. The varint's least significant bit is
-// used to indicate whether the run is a literal run or a repeated run. The rest
-// of the varint is used to determine the length of the run (eg how many times the
-// value repeats).
+/// Each run is preceded by a varint. The varint's least significant bit is
+/// used to indicate whether the run is a literal run or a repeated run. The rest
+/// of the varint is used to determine the length of the run (eg how many times the
+/// value repeats).
 //
-// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
-// in groups of 8), so that no matter the bit-width of the value, the sequence will end
-// on a byte boundary without padding.
-// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
-// the actual number of encoded ints. (This means that the total number of encoded values
-// can not be determined from the encoded data, since the number of values in the last
-// group may not be a multiple of 8). For the last group of literal runs, we pad
-// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
-// without the need for additional checks.
+/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
+/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
+/// on a byte boundary without padding.
+/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
+/// the actual number of encoded ints. (This means that the total number of encoded values
+/// can not be determined from the encoded data, since the number of values in the last
+/// group may not be a multiple of 8). For the last group of literal runs, we pad
+/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
+/// without the need for additional checks.
 //
-// There is a break-even point when it is more storage efficient to do run length
-// encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
-// for both the repeated encoding or the literal encoding.  This value can always
-// be computed based on the bit-width.
-// TODO: think about how to use this for strings.  The bit packing isn't quite the same.
+/// There is a break-even point when it is more storage efficient to do run length
+/// encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
+/// for both the repeated encoding or the literal encoding.  This value can always
+/// be computed based on the bit-width.
+/// TODO: think about how to use this for strings.  The bit packing isn't quite the same.
 //
-// Examples with bit-width 1 (eg encoding booleans):
-// ----------------------------------------
-// 100 1s followed by 100 0s:
-// <varint(100 << 1)> <1, padded to 1 byte>  <varint(100 << 1)> <0, padded to 1 byte>
-//  - (total 4 bytes)
+/// Examples with bit-width 1 (eg encoding booleans):
+/// ----------------------------------------
+/// 100 1s followed by 100 0s:
+/// <varint(100 << 1)> <1, padded to 1 byte>  <varint(100 << 1)> <0, padded to 1 byte>
+///  - (total 4 bytes)
 //
-// alternating 1s and 0s (200 total):
-// 200 ints = 25 groups of 8
-// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
-// (total 26 bytes, 1 byte overhead)
+/// alternating 1s and 0s (200 total):
+/// 200 ints = 25 groups of 8
+/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
+/// (total 26 bytes, 1 byte overhead)
 //
 
-// Decoder class for RLE encoded data.
+/// Decoder class for RLE encoded data.
 class RleDecoder {
  public:
-  // Create a decoder object. buffer/buffer_len is the decoded data.
-  // bit_width is the width of each value (before encoding).
+  /// Create a decoder object. buffer/buffer_len is the decoded data.
+  /// bit_width is the width of each value (before encoding).
   RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
     : bit_reader_(buffer, buffer_len),
       bit_width_(bit_width),
@@ -93,170 +94,172 @@ class RleDecoder {
     DCHECK_LE(bit_width_, 64);
   }
 
-  RleDecoder() {}
+  RleDecoder() : bit_width_(-1) {}
+
+  /// Re-targets the decoder at a new buffer/bit width and clears all run state.
+  void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
+    DCHECK_GE(bit_width, 0);
+    DCHECK_LE(bit_width, 64);
+    bit_width_ = bit_width;
+    bit_reader_.Reset(buffer, buffer_len);
+    current_value_ = 0;
+    repeat_count_ = literal_count_ = 0;
+  }
 
-  // Gets the next value.  Returns false if there are no more.
+  /// Gets the next value.  Returns false if there are no more.
   template<typename T>
   bool Get(T* val);
 
- private:
+ protected:
   BitReader bit_reader_;
+  /// Number of bits needed to encode the value. Must be between 0 and 64.
   int bit_width_;
   uint64_t current_value_;
   uint32_t repeat_count_;
   uint32_t literal_count_;
+
+ private:
+  /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
+  /// are no more.
+  template<typename T>
+  bool NextCounts();
 };
 
-// Class to incrementally build the rle data.   This class does not allocate any memory.
-// The encoding has two modes: encoding repeated runs and literal runs.
-// If the run is sufficiently short, it is more efficient to encode as a literal run.
-// This class does so by buffering 8 values at a time.  If they are not all the same
-// they are added to the literal run.  If they are the same, they are added to the
-// repeated run.  When we switch modes, the previous run is flushed out.
+/// Class to incrementally build the rle data.   This class does not allocate any memory.
+/// The encoding has two modes: encoding repeated runs and literal runs.
+/// If the run is sufficiently short, it is more efficient to encode as a literal run.
+/// This class does so by buffering 8 values at a time.  If they are not all the same
+/// they are added to the literal run.  If they are the same, they are added to the
+/// repeated run.  When we switch modes, the previous run is flushed out.
 class RleEncoder {
  public:
-  // buffer/buffer_len: preallocated output buffer.
-  // bit_width: max number of bits for value.
-  // TODO: consider adding a min_repeated_run_length so the caller can control
-  // when values should be encoded as repeated runs.  Currently this is derived
-  // based on the bit_width, which can determine a storage optimal choice.
-  // TODO: allow 0 bit_width (and have dict encoder use it)
+  /// buffer/buffer_len: preallocated output buffer.
+  /// bit_width: max number of bits for value.
+  /// TODO: consider adding a min_repeated_run_length so the caller can control
+  /// when values should be encoded as repeated runs.  Currently this is derived
+  /// based on the bit_width, which can determine a storage optimal choice.
+  /// TODO: allow 0 bit_width (and have dict encoder use it)
   RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
     : bit_width_(bit_width),
       bit_writer_(buffer, buffer_len) {
-    DCHECK_GE(bit_width_, 1);
+    DCHECK_GE(bit_width_, 0);
     DCHECK_LE(bit_width_, 64);
     max_run_byte_size_ = MinBufferSize(bit_width);
     DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
     Clear();
   }
 
-  // Returns the minimum buffer size needed to use the encoder for 'bit_width'
-  // This is the maximum length of a single run for 'bit_width'.
-  // It is not valid to pass a buffer less than this length.
+  /// Returns the minimum buffer size needed to use the encoder for 'bit_width'
+  /// This is the maximum length of a single run for 'bit_width'.
+  /// It is not valid to pass a buffer less than this length.
   static int MinBufferSize(int bit_width) {
-    // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
+    /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
     int max_literal_run_size = 1 +
         BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8);
-    // Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value.
+    /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value.
     int max_repeated_run_size = BitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8);
     return std::max(max_literal_run_size, max_repeated_run_size);
   }
 
-  // Returns the maximum byte size it could take to encode 'num_values'.
+  /// Returns the maximum byte size it could take to encode 'num_values'.
   static int MaxBufferSize(int bit_width, int num_values) {
     int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0);
     int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN);
     int literal_max_size = num_runs + num_runs * bytes_per_run;
-    int min_run_size = MinBufferSize(bit_width);
-    return std::max(min_run_size, literal_max_size) + min_run_size;
+    return std::max(MinBufferSize(bit_width), literal_max_size);
   }
 
-  // Encode value.  Returns true if the value fits in buffer, false otherwise.
-  // This value must be representable with bit_width_ bits.
+  /// Encode value.  Returns true if the value fits in buffer, false otherwise.
+  /// This value must be representable with bit_width_ bits.
   bool Put(uint64_t value);
 
-  // Flushes any pending values to the underlying buffer.
-  // Returns the total number of bytes written
+  /// Flushes any pending values to the underlying buffer.
+  /// Returns the total number of bytes written
   int Flush();
 
-  // Resets all the state in the encoder.
+  /// Resets all the state in the encoder.
   void Clear();
 
-  // Returns pointer to underlying buffer
+  /// Returns pointer to underlying buffer
   uint8_t* buffer() { return bit_writer_.buffer(); }
   int32_t len() { return bit_writer_.bytes_written(); }
 
  private:
-  // Flushes any buffered values.  If this is part of a repeated run, this is largely
-  // a no-op.
-  // If it is part of a literal run, this will call FlushLiteralRun, which writes
-  // out the buffered literal values.
-  // If 'done' is true, the current run would be written even if it would normally
-  // have been buffered more.  This should only be called at the end, when the
-  // encoder has received all values even if it would normally continue to be
-  // buffered.
+  /// Flushes any buffered values.  If this is part of a repeated run, this is largely
+  /// a no-op.
+  /// If it is part of a literal run, this will call FlushLiteralRun, which writes
+  /// out the buffered literal values.
+  /// If 'done' is true, the current run would be written even if it would normally
+  /// have been buffered more.  This should only be called at the end, when the
+  /// encoder has received all values even if it would normally continue to be
+  /// buffered.
   void FlushBufferedValues(bool done);
 
-  // Flushes literal values to the underlying buffer.  If update_indicator_byte,
-  // then the current literal run is complete and the indicator byte is updated.
+  /// Flushes literal values to the underlying buffer.  If update_indicator_byte,
+  /// then the current literal run is complete and the indicator byte is updated.
   void FlushLiteralRun(bool update_indicator_byte);
 
-  // Flushes a repeated run to the underlying buffer.
+  /// Flushes a repeated run to the underlying buffer.
   void FlushRepeatedRun();
 
-  // Checks and sets buffer_full_. This must be called after flushing a run to
-  // make sure there are enough bytes remaining to encode the next run.
+  /// Checks and sets buffer_full_. This must be called after flushing a run to
+  /// make sure there are enough bytes remaining to encode the next run.
   void CheckBufferFull();
 
-  // The maximum number of values in a single literal run
-  // (number of groups encodable by a 1-byte indicator * 8)
+  /// The maximum number of values in a single literal run
+  /// (number of groups encodable by a 1-byte indicator * 8)
   static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
 
-  // Number of bits needed to encode the value.
+  /// Number of bits needed to encode the value. Must be between 0 and 64.
   const int bit_width_;
 
-  // Underlying buffer.
+  /// Underlying buffer.
   BitWriter bit_writer_;
 
-  // If true, the buffer is full and subsequent Put()'s will fail.
+  /// If true, the buffer is full and subsequent Put()'s will fail.
   bool buffer_full_;
 
-  // The maximum byte size a single run can take.
+  /// The maximum byte size a single run can take.
   int max_run_byte_size_;
 
-  // We need to buffer at most 8 values for literals.  This happens when the
-  // bit_width is 1 (so 8 values fit in one byte).
-  // TODO: generalize this to other bit widths
+  /// We need to buffer at most 8 values for literals.  This happens when the
+  /// bit_width is 1 (so 8 values fit in one byte).
+  /// TODO: generalize this to other bit widths
   int64_t buffered_values_[8];
 
-  // Number of values in buffered_values_
+  /// Number of values in buffered_values_
   int num_buffered_values_;
 
-  // The current (also last) value that was written and the count of how
-  // many times in a row that value has been seen.  This is maintained even
-  // if we are in a literal run.  If the repeat_count_ get high enough, we switch
-  // to encoding repeated runs.
+  /// The current (also last) value that was written and the count of how
+  /// many times in a row that value has been seen.  This is maintained even
+  /// if we are in a literal run.  If the repeat_count_ get high enough, we switch
+  /// to encoding repeated runs.
   int64_t current_value_;
   int repeat_count_;
 
-  // Number of literals in the current run.  This does not include the literals
-  // that might be in buffered_values_.  Only after we've got a group big enough
-  // can we decide if they should part of the literal_count_ or repeat_count_
+  /// Number of literals in the current run.  This does not include the literals
+  /// that might be in buffered_values_.  Only after we've got a group big enough
+  /// can we decide if they should part of the literal_count_ or repeat_count_
   int literal_count_;
 
-  // Pointer to a byte in the underlying buffer that stores the indicator byte.
-  // This is reserved as soon as we need a literal run but the value is written
-  // when the literal run is complete.
+  /// Pointer to a byte in the underlying buffer that stores the indicator byte.
+  /// This is reserved as soon as we need a literal run but the value is written
+  /// when the literal run is complete.
   uint8_t* literal_indicator_byte_;
 };
 
 template<typename T>
 inline bool RleDecoder::Get(T* val) {
+  DCHECK_GE(bit_width_, 0);
   if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) {
-    // Read the next run's indicator int, it could be a literal or repeated run
-    // The int is encoded as a vlq-encoded value.
-    uint64_t indicator_value = 0;
-    bool result = bit_reader_.GetVlqInt(&indicator_value);
-    if (!result) return false;
-
-    // lsb indicates if it is a literal run or repeated run
-    bool is_literal = indicator_value & 1;
-    if (is_literal) {
-      literal_count_ = (indicator_value >> 1) * 8;
-    } else {
-      repeat_count_ = indicator_value >> 1;
-      bool result = bit_reader_.GetAligned<T>(
-          BitUtil::Ceil(bit_width_, 8), reinterpret_cast<T*>(&current_value_));
-      DCHECK(result);
-    }
+    if (!NextCounts<T>()) return false;
   }
 
   if (LIKELY(repeat_count_ > 0)) {
     *val = current_value_;
     --repeat_count_;
   } else {
-    DCHECK(literal_count_ > 0);
+    DCHECK_GT(literal_count_, 0);
     bool result = bit_reader_.GetValue(bit_width_, val);
     DCHECK(result);
     --literal_count_;
@@ -265,8 +268,29 @@ inline bool RleDecoder::Get(T* val) {
   return true;
 }
 
-// This function buffers input values 8 at a time.  After seeing all 8 values,
-// it decides whether they should be encoded as a literal or repeated run.
+template<typename T>
+bool RleDecoder::NextCounts() {
+  // Every run starts with a VLQ-encoded header: bit 0 is the run type and the
+  // remaining bits are the run length.
+  int32_t run_header = 0;
+  if (!bit_reader_.GetVlqInt(&run_header)) return false;
+
+  const int32_t run_length = run_header >> 1;
+  if ((run_header & 1) != 0) {
+    // Literal run: the header counts groups of 8 bit-packed values.
+    literal_count_ = run_length * 8;
+    return true;
+  }
+  // Repeated run: the value follows, padded out to a whole number of bytes.
+  repeat_count_ = run_length;
+  bool value_read = bit_reader_.GetAligned<T>(
+      BitUtil::Ceil(bit_width_, 8), reinterpret_cast<T*>(&current_value_));
+  DCHECK(value_read);
+  return true;
+}
+
+/// This function buffers input values 8 at a time.  After seeing all 8 values,
+/// it decides whether they should be encoded as a literal or repeated run.
 inline bool RleEncoder::Put(uint64_t value) {
   DCHECK(bit_width_ == 64 || value < (1LL << bit_width_));
   if (UNLIKELY(buffer_full_)) return false;
@@ -341,8 +365,8 @@ inline void RleEncoder::FlushRepeatedRun() {
   CheckBufferFull();
 }
 
-// Flush the values that have been buffered.  At this point we decide whether
-// we need to switch between the run types or continue the current one.
+/// Flush the values that have been buffered.  At this point we decide whether
+/// we need to switch between the run types or continue the current one.
 inline void RleEncoder::FlushBufferedValues(bool done) {
   if (repeat_count_ >= 8) {
     // Clear the buffered values.  They are part of the repeated run now and we
diff --git a/cpp/src/parquet/util/rle-test.cc b/cpp/src/parquet/util/rle-test.cc
new file mode 100644
index 0000000000000..b2628e981d6e1
--- /dev/null
+++ b/cpp/src/parquet/util/rle-test.cc
@@ -0,0 +1,400 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+#include <boost/utility.hpp>
+#include <gtest/gtest.h>
+#include <math.h>
+
+#include "parquet/util/rle-encoding.h"
+#include "parquet/util/bit-stream-utils.inline.h"
+
+using std::vector;
+
+namespace parquet_cpp {
+
+const int MAX_WIDTH = 32;
+
+TEST(BitArray, TestBool) {
+  const int len = 8;
+  uint8_t buffer[len];
+
+  BitWriter writer(buffer, len);
+
+  // Write alternating 0's and 1's; the first value is expected in the lowest bit
+  for (int i = 0; i < 8; ++i) {
+    bool result = writer.PutValue(i % 2, 1);
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+  EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
+
+  // Write 0,0,1,1,0,0,1,1 (in write order) as the second byte
+  for (int i = 0; i < 8; ++i) {
+    bool result = false;
+    switch (i) {
+      case 0:
+      case 1:
+      case 4:
+      case 5:
+        result = writer.PutValue(false, 1);
+        break;
+      default:
+        result = writer.PutValue(true, 1);
+        break;
+    }
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+
+  // Validate the exact bit value of both bytes written so far
+  EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
+  EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0));
+
+  // Read the same 16 bits back through a BitReader and validate each one
+  BitReader reader(buffer, len);
+  for (int i = 0; i < 8; ++i) {
+    bool val = false;
+    bool result = reader.GetValue(1, &val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, i % 2);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    bool val = false;
+    bool result = reader.GetValue(1, &val);
+    EXPECT_TRUE(result);
+    switch (i) {
+      case 0:
+      case 1:
+      case 4:
+      case 5:
+        EXPECT_EQ(val, false);
+        break;
+      default:
+        EXPECT_EQ(val, true);
+        break;
+    }
+  }
+}
+
+// Writes 'num_vals' values with width 'bit_width' and reads them back.
+void TestBitArrayValues(int bit_width, int num_vals) {
+  const int len = BitUtil::Ceil(bit_width * num_vals, 8);
+  const uint64_t mod = bit_width == 64? 1 : 1LL << bit_width;
+  // 'len' is 0 when bit_width is 0, which is not a valid bound for a stack
+  // array; keep at least one heap byte so the buffer pointer is always valid.
+  vector<uint8_t> storage(len > 0 ? len : 1);
+  uint8_t* buffer = storage.data();
+  BitWriter writer(buffer, len);
+  for (int i = 0; i < num_vals; ++i) {
+    EXPECT_TRUE(writer.PutValue(i % mod, bit_width));
+  }
+  writer.Flush();
+  EXPECT_EQ(writer.bytes_written(), len);
+
+  BitReader reader(buffer, len);
+  for (int i = 0; i < num_vals; ++i) {
+    int64_t val = 0;
+    EXPECT_TRUE(reader.GetValue(bit_width, &val));
+    EXPECT_EQ(val, i % mod);
+  }
+  EXPECT_EQ(reader.bytes_left(), 0);
+}
+
+TEST(BitArray, TestValues) {
+  for (int width = 0; width <= MAX_WIDTH; ++width) {
+    // The third count is capped at 4096 so that wide widths don't write too much.
+    const int counts[] = {1, 2, (width < 12) ? (1 << width) : 4096, 1024};
+    for (int count : counts) {
+      TestBitArrayValues(width, count);
+    }
+  }
+}
+
+// Test interleaving 1-bit and 10-bit values in a single stream
+TEST(BitArray, TestMixed) {
+  const int len = 1024;
+  uint8_t buffer[len];
+  bool parity = true;
+
+  BitWriter writer(buffer, len);
+  for (int i = 0; i < len; ++i) {
+    bool result;
+    if (i % 2 == 0) {
+      result = writer.PutValue(parity, 1);  // even slots: one alternating bit
+      parity = !parity;
+    } else {
+      result = writer.PutValue(i, 10);  // odd slots: the index itself in 10 bits
+    }
+    EXPECT_TRUE(result);
+  }
+  writer.Flush();
+
+  parity = true;
+  BitReader reader(buffer, len);
+  for (int i = 0; i < len; ++i) {
+    bool result;
+    if (i % 2 == 0) {
+      bool val;
+      result = reader.GetValue(1, &val);
+      EXPECT_EQ(val, parity);
+      parity = !parity;
+    } else {
+      int val;
+      result = reader.GetValue(10, &val);
+      EXPECT_EQ(val, i);
+    }
+    EXPECT_TRUE(result);
+  }
+}
+
+// Validates encoding of values by encoding and decoding them.  If
+// expected_encoding != NULL, also validates that the encoded buffer is
+// exactly 'expected_encoding'; expected_len must then be its length.
+// if expected_len is not -1, it will validate the encoded size is correct.
+void ValidateRle(const vector<int>& values, int bit_width,
+                 uint8_t* expected_encoding, int expected_len) {
+  const int len = 64 * 1024;
+  uint8_t buffer[len];
+  EXPECT_LE(expected_len, len);
+
+  RleEncoder encoder(buffer, len, bit_width);
+  for (size_t i = 0; i < values.size(); ++i) {
+    EXPECT_TRUE(encoder.Put(values[i]));
+  }
+  int encoded_len = encoder.Flush();
+
+  if (expected_len != -1) {
+    EXPECT_EQ(encoded_len, expected_len);
+  }
+  if (expected_encoding != NULL) {
+    // memcmp() takes a size_t, so a -1 length would compare far past the buffers.
+    ASSERT_GE(expected_len, 0);
+    EXPECT_TRUE(memcmp(buffer, expected_encoding, expected_len) == 0);
+  }
+
+  // Verify read
+  RleDecoder decoder(buffer, len, bit_width);
+  for (size_t i = 0; i < values.size(); ++i) {
+    uint64_t val = 0;
+    EXPECT_TRUE(decoder.Get(&val));
+    EXPECT_EQ(values[i], val);
+  }
+}
+
+TEST(Rle, SpecificSequences) {
+  const int len = 1024;
+  uint8_t expected_buffer[len];
+  vector<int> values;
+
+  // Test 50 0's followed by 50 1's
+  values.resize(100);
+  for (int i = 0; i < 50; ++i) {
+    values[i] = 0;
+  }
+  for (int i = 50; i < 100; ++i) {
+    values[i] = 1;
+  }
+
+  // Two repeated runs, each an indicator byte (count << 1) followed by the value.
+  // expected_buffer valid for bit width <= 1 byte
+  expected_buffer[0] = (50 << 1);
+  expected_buffer[1] = 0;
+  expected_buffer[2] = (50 << 1);
+  expected_buffer[3] = 1;
+  for (int width = 1; width <= 8; ++width) {
+    ValidateRle(values, width, expected_buffer, 4);
+  }
+
+  for (int width = 9; width <= MAX_WIDTH; ++width) {
+    ValidateRle(values, width, NULL, 2 * (1 + BitUtil::Ceil(width, 8)));
+  }
+
+  // Test 100 0's and 1's alternating
+  for (int i = 0; i < 100; ++i) {
+    values[i] = i % 2;
+  }
+  int num_groups = BitUtil::Ceil(100, 8);
+  expected_buffer[0] = (num_groups << 1) | 1;  // literal indicator byte
+  for (int i = 1; i <= 100/8; ++i) {
+    expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0);
+  }
+  // Values for the last 4 0's and 1's. The upper 4 bits should be padded to 0.
+  expected_buffer[100/8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0);
+
+  // num_groups and expected_buffer only valid for bit width = 1
+  ValidateRle(values, 1, expected_buffer, 1 + num_groups);
+  for (int width = 2; width <= MAX_WIDTH; ++width) {
+    int num_values = BitUtil::Ceil(100, 8) * 8;
+    ValidateRle(values, width, NULL, 1 + BitUtil::Ceil(width * num_values, 8));
+  }
+}
+
+// Runs ValidateRle on 'num_vals' values of width 'bit_width'. A 'value' other than -1
+// is used for every entry; otherwise entry i is i modulo 'modulus' (see below).
+void TestRleValues(int bit_width, int num_vals, int value = -1) {
+  const uint64_t modulus = (bit_width == 64) ? 1 : 1LL << bit_width;
+  vector<int> values(num_vals);
+  for (int i = 0; i < num_vals; ++i) {
+    values[i] = (value != -1) ? value : (i % modulus);
+  }
+  ValidateRle(values, bit_width, NULL, -1);
+}
+
+TEST(Rle, TestValues) {
+  for (int width = 1; width <= MAX_WIDTH; ++width) {
+    TestRleValues(width, 1);
+    for (int fill = -1; fill <= 1; ++fill) {  // -1 selects the default pattern
+      TestRleValues(width, 1024, fill);
+    }
+  }
+}
+
+TEST(Rle, BitWidthZeroRepeated) {
+  const int num_values = 15;
+  uint8_t buffer[1];
+  buffer[0] = num_values << 1;  // repeated indicator byte
+  RleDecoder decoder(buffer, sizeof(buffer), 0);
+  uint8_t decoded;
+  for (int remaining = num_values; remaining > 0; --remaining) {
+    EXPECT_TRUE(decoder.Get(&decoded));
+    EXPECT_EQ(decoded, 0);  // can only encode 0s with bit width 0
+  }
+  // The single run is used up, so a further read must fail.
+  EXPECT_FALSE(decoder.Get(&decoded));
+}
+
+TEST(Rle, BitWidthZeroLiteral) {
+  const int num_groups = 4;
+  const int num_values = num_groups * 8;  // a literal run holds 8 values per group
+  uint8_t buffer[1];
+  buffer[0] = (num_groups << 1) | 1;  // literal indicator byte
+  RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
+  uint8_t decoded;
+  for (int remaining = num_values; remaining > 0; --remaining) {
+    EXPECT_TRUE(decoder.Get(&decoded));
+    EXPECT_EQ(decoded, 0);  // can only encode 0s with bit width 0
+  }
+  // Every group has been read, so a further read must fail.
+  EXPECT_FALSE(decoder.Get(&decoded));
+}
+
+// Test that writes out a repeated group followed by a literal group, flushing
+// while the literal group is still unfinished.
+TEST(BitRle, Flush) {
+  // Sixteen 1s, then a single 0 that breaks the repetition.
+  vector<int> values(16, 1);
+  values.push_back(0);
+  ValidateRle(values, 1, NULL, -1);
+
+  // Extend the trailing values one at a time, validating after each addition.
+  for (int extra = 0; extra < 3; ++extra) {
+    values.push_back(1);
+    ValidateRle(values, 1, NULL, -1);
+  }
+}
+
+// Test some random sequences.
+TEST(BitRle, Random) {
+  for (int seed = 0; seed < 1000; ++seed) {
+    // Attach the seed to any failure below so that the sequence can be reproduced.
+    SCOPED_TRACE(testing::Message() << "Seed: " << seed);
+    srand(seed);
+    vector<int> values;
+    bool parity = false;
+    for (int run = 0; run < 1000; ++run) {
+      int group_size = rand() % 20 + 1;  // NOLINT
+      if (group_size > 16) {
+        group_size = 1;
+      }
+      for (int j = 0; j < group_size; ++j) {
+        values.push_back(parity);
+      }
+      parity = !parity;
+    }
+    ValidateRle(values, ((seed + 1) % MAX_WIDTH) + 1, NULL, -1);
+  }
+}
+
+// Test runs of growing length with alternating values: 1 one, 2 zeros, 3 ones,
+// etc. (i.e. 100111000011111...), then the same runs again in reverse order.
+TEST(BitRle, RepeatedPattern) {
+  const int min_run = 1;
+  const int max_run = 32;
+  vector<int> values;
+
+  // Ramp up: the run of length 'run' repeats the value run % 2.
+  for (int run = min_run; run <= max_run; ++run) {
+    for (int n = 0; n < run; ++n) {
+      values.push_back(run % 2);
+    }
+  }
+
+  // And go back down again
+  for (int run = max_run; run >= min_run; --run) {
+    for (int n = 0; n < run; ++n) {
+      values.push_back(run % 2);
+    }
+  }
+
+  // Round-trip the whole pattern as 1-bit values.
+  ValidateRle(values, 1, NULL, -1);
+}
+
+TEST(BitRle, Overflow) {
+  for (int bit_width = 1; bit_width < 32; bit_width += 3) {
+    const int len = RleEncoder::MinBufferSize(bit_width);
+    // 'len' is only known at run time, so use a vector rather than a stack array.
+    vector<uint8_t> buffer(len);
+    int num_added = 0;
+    bool parity = true;
+
+    RleEncoder encoder(buffer.data(), len, bit_width);
+    // Insert alternating true/false until there is no space left
+    while (true) {
+      bool result = encoder.Put(parity);
+      parity = !parity;
+      if (!result) break;
+      ++num_added;
+    }
+
+    int bytes_written = encoder.Flush();
+    EXPECT_LE(bytes_written, len);
+    EXPECT_GT(num_added, 0);
+
+    RleDecoder decoder(buffer.data(), bytes_written, bit_width);
+    parity = true;
+    uint32_t v = 0;
+    for (int i = 0; i < num_added; ++i) {
+      EXPECT_TRUE(decoder.Get(&v));
+      EXPECT_EQ(v, parity);
+      parity = !parity;
+    }
+    // Make sure we get false when reading past end a couple times.
+    EXPECT_FALSE(decoder.Get(&v));
+    EXPECT_FALSE(decoder.Get(&v));
+  }
+}
+
+} // namespace parquet_cpp
diff --git a/cpp/src/parquet/util/sse-util.h b/cpp/src/parquet/util/sse-util.h
new file mode 100644
index 0000000000000..588c30a07f238
--- /dev/null
+++ b/cpp/src/parquet/util/sse-util.h
@@ -0,0 +1,191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
+// functions needed for parquet-cpp
+
+#ifndef PARQUET_UTIL_SSE_UTIL_H
+#define PARQUET_UTIL_SSE_UTIL_H
+
+#include <emmintrin.h>
+
+namespace parquet_cpp {
+
+/// Constants useful for text processing with SSE4.2 intrinsics.
+namespace SSEUtil {
+  /// How many 8-bit characters fit in a 64-bit / 128-bit register.  SSE has
+  /// instructions that load either 64 or 128 bits into a register at once.
+  static const int CHARS_PER_64_BIT_REGISTER = 8;
+  static const int CHARS_PER_128_BIT_REGISTER = 16;
+
+  /// Flags for the control byte taken by the SSE4.2 text-processing instructions;
+  /// the byte determines some of each instruction's functionality.  (Equivalent to
+  /// GCC's _SIDD_CMP_EQUAL_ANY, etc).
+  static const int PCMPSTR_EQUAL_ANY    = 0x00; // strchr
+  static const int PCMPSTR_EQUAL_EACH   = 0x08; // strcmp
+  static const int PCMPSTR_UBYTE_OPS    = 0x00; // unsigned char (8-bits, rather than 16)
+  static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4.
+
+  /// Control byte with which the SSE text processing functions return a mask of
+  /// all the characters that matched.
+  static const int STRCHR_MODE = PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_ANY;
+
+  /// Control byte with which the SSE text processing functions return the number
+  /// of bytes that match consecutively from the beginning.
+  static const int STRCMP_MODE =
+      PCMPSTR_NEG_POLARITY | PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_EACH;
+
+  /// Precomputed single-bit masks: entry i equals 1 << i, for i in [0, 16).
+  static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
+    0x0001,
+    0x0002,
+    0x0004,
+    0x0008,
+    0x0010,
+    0x0020,
+    0x0040,
+    0x0080,
+    0x0100,
+    0x0200,
+    0x0400,
+    0x0800,
+    0x1000,
+    0x2000,
+    0x4000,
+    0x8000,
+  };
+} // namespace SSEUtil
+
+/// Define the SSE 4.2 intrinsics.  The caller must first verify at runtime (or codegen
+/// IR load time) that the processor supports SSE 4.2 before calling these.  These are
+/// defined outside the SSEUtil namespace because the IR w/ SSE 4.2 case uses macros.
+#ifndef IR_COMPILE
+/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler
+/// flag.  Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime
+/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2
+/// (IMPALA-1399/1646).  The compiler intrinsics cannot be used without -msse4.2, so we
+/// define our own implementations of the intrinsics instead.
+
+#if defined(__SSE4_1__) || defined(__POPCNT__)
+/// This native code (inherited from Impala) should not be compiled with -msse4.1 or
+/// higher until the minimum CPU requirement is raised to at least that instruction set.
+#error "Do not compile with -msse4.1 or higher."
+#endif
+
+/// The PCMPxSTRy instructions need their control byte 'mode' encoded as an immediate.
+/// This macro forces inlining so that constant reaches the inline asm; the wrappers
+/// below pass MODE as a template argument instead and do not reference the macro.
+#define SSE_ALWAYS_INLINE inline __attribute__ ((__always_inline__))
+
+template<int MODE>  // MODE is handed to the asm as an immediate operand ("i")
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+  /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
+  /// clang doesn't support Y-prefixed asm constraints.
+  register volatile __m128i result asm("xmm0");  // result is tied to xmm0
+  __asm__ volatile ("pcmpestrm %5, %2, %1"
+      : "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
+  return result;  // the value the asm statement produced in xmm0
+}
+
+template<int MODE>  // MODE is handed to the asm as an immediate operand ("i")
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  int result;  // asm output, bound via the "=c" register constraint
+  __asm__("pcmpestri %5, %2, %1"
+      : "=c"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
+  return result;
+}
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {  // runs crc32b
+  __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v));  // "+r": crc is read and updated
+  return crc;  // the value the instruction left in crc
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {  // runs crc32l
+  __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v));  // "+r": crc is read and updated
+  return crc;  // the value the instruction left in crc
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {  // runs popcntq on 'a'
+  int64_t result;  // asm output ("=r")
+  __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc");  // "cc": flags clobbered
+  return result;
+}
+
+#undef SSE_ALWAYS_INLINE
+
+#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2.
+/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not
+/// support it.  However, the cross-compiled IR is compiled twice: with and without
+/// -msse4.2.  When -msse4.2 is enabled in the cross-compile, we can just use the
+/// compiler intrinsics.
+
+#include <smmintrin.h>
+
+template<int MODE>  // MODE is forwarded as the intrinsic's last argument
+static inline __m128i SSE4_cmpestrm(
+    __m128i str1, int len1, __m128i str2, int len2) {
+  return _mm_cmpestrm(str1, len1, str2, len2, MODE);  // compiler intrinsic, no asm
+}
+
+template<int MODE>  // MODE is forwarded as the intrinsic's last argument
+static inline int SSE4_cmpestri(
+    __m128i str1, int len1, __m128i str2, int len2) {
+  return _mm_cmpestri(str1, len1, str2, len2, MODE);  // compiler intrinsic, no asm
+}
+
+#define SSE4_crc32_u8 _mm_crc32_u8  // alias straight to the compiler intrinsic
+#define SSE4_crc32_u32 _mm_crc32_u32  // alias straight to the compiler intrinsic
+#define POPCNT_popcnt_u64 _mm_popcnt_u64  // alias straight to the compiler intrinsic
+
+#else  // IR_COMPILE without SSE 4.2.
+/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use
+/// SSE 4.2 instructions.  Otherwise, the IR loading will fail on CPUs that don't
+/// support SSE 4.2.  However, because the caller isn't allowed to call these routines
+/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case.
+
+template<int MODE>  // stub for IR builds without SSE 4.2; MODE is unused
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";  // callers must never get here
+  return (__m128i) { 0 }; // NOLINT
+}
+
+template<int MODE>  // stub for IR builds without SSE 4.2; MODE is unused
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";  // callers must never get here
+  return 0;  // placeholder value
+}
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {  // no-SSE4.2 stub
+  DCHECK(false) << "CPU doesn't support SSE 4.2";  // callers must never get here
+  return 0;  // placeholder value
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {  // no-SSE4.2 stub
+  DCHECK(false) << "CPU doesn't support SSE 4.2";  // callers must never get here
+  return 0;  // placeholder value
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {  // no-SSE4.2 stub
+  DCHECK(false) << "CPU doesn't support SSE 4.2";  // callers must never get here
+  return 0;  // placeholder value
+}
+
+#endif
+
+} // namespace parquet_cpp
+
+#endif //  PARQUET_UTIL_SSE_UTIL_H