Skip to content

Commit

Permalink
PARQUET-700: Disable dictionary encoding for boolean columns
Browse files: browse the repository at this point in the history
Author: Uwe L. Korn <uwelk@xhochy.com>

Closes apache#148 from xhochy/parquet-700 and squashes the following commits:

d33a670 [Uwe L. Korn] Format fixes
e8530ba [Uwe L. Korn] Also test writing booleans with Dictionary encoding
328b430 [Uwe L. Korn] Format fixes
ab33f9b [Uwe L. Korn] PARQUET-700: Disable dictionary encoding for boolean columns

Change-Id: Ic32e7d443bab3849ed001bbc9b5028c500c1f49b
  • Loading branch information
xhochy authored and wesm committed Sep 2, 2016
1 parent aa77a95 commit 265215d
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 16 deletions.
29 changes: 14 additions & 15 deletions cpp/src/parquet/column/column-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,22 @@ class TestPrimitiveWriter : public ::testing::Test {
reader_.reset(new TypedColumnReader<TestType>(schema_.get(), std::move(page_reader)));
}

std::unique_ptr<TypedColumnWriter<TestType>> BuildWriter(
std::shared_ptr<TypedColumnWriter<TestType>> BuildWriter(
int64_t output_size = SMALL_SIZE, Encoding::type encoding = Encoding::PLAIN) {
sink_.reset(new InMemoryOutputStream());
std::unique_ptr<SerializedPageWriter> pager(
new SerializedPageWriter(sink_.get(), Compression::UNCOMPRESSED, &metadata_));
return std::unique_ptr<TypedColumnWriter<TestType>>(
new TypedColumnWriter<TestType>(schema_.get(), std::move(pager), output_size,
encoding, writer_properties_.get()));
WriterProperties::Builder wp_builder;
if (encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY) {
wp_builder.enable_dictionary();
} else {
wp_builder.disable_dictionary();
wp_builder.encoding(encoding);
}
writer_properties_ = wp_builder.build();
std::shared_ptr<ColumnWriter> writer = ColumnWriter::Make(
schema_.get(), std::move(pager), output_size, writer_properties_.get());
return std::static_pointer_cast<TypedColumnWriter<TestType>>(writer);
}

void SyncValuesOut();
Expand All @@ -106,7 +114,7 @@ class TestPrimitiveWriter : public ::testing::Test {
this->GenerateData(SMALL_SIZE);

// Test case 1: required and non-repeated, so no definition or repetition levels
std::unique_ptr<TypedColumnWriter<TestType>> writer =
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(SMALL_SIZE, encoding);
writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
// The behaviour should be independent of the number of Close() calls
Expand Down Expand Up @@ -191,20 +199,11 @@ typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,

TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);

// Dictionary encoding for booleans is not supported.
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
ByteArrayType, FLBAType> TestDictionaryTypes;

template <typename T>
class TestPrimitiveDictionaryWriter : public TestPrimitiveWriter<T> {};

TYPED_TEST_CASE(TestPrimitiveDictionaryWriter, TestDictionaryTypes);

TYPED_TEST(TestPrimitiveWriter, RequiredPlain) {
this->TestRequiredWithEncoding(Encoding::PLAIN);
}

TYPED_TEST(TestPrimitiveDictionaryWriter, RequiredDictionary) {
TYPED_TEST(TestPrimitiveWriter, RequiredDictionary) {
this->TestRequiredWithEncoding(Encoding::PLAIN_DICTIONARY);
}

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/parquet/column/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,8 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(const ColumnDescriptor* descr,
std::unique_ptr<PageWriter> pager, int64_t expected_rows,
const WriterProperties* properties) {
Encoding::type encoding = properties->encoding(descr->path());
if (properties->dictionary_enabled(descr->path())) {
if (properties->dictionary_enabled(descr->path()) &&
descr->physical_type() != Type::BOOLEAN) {
encoding = properties->dictionary_page_encoding();
}
switch (descr->physical_type()) {
Expand Down

0 comments on commit 265215d

Please sign in to comment.