Skip to content

Commit

Permalink
Use specific encoding to detect dictionary columns in Parquet
Browse files Browse the repository at this point in the history
The existing checks were faulty and caused non-dictionary encoded pages to be
filtered out.
  • Loading branch information
rdblue authored and dain committed Apr 22, 2016
1 parent a8601a8 commit a280baf
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 10 deletions.
Expand Up @@ -21,8 +21,11 @@
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.TypeManager;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import org.apache.hadoop.conf.Configuration;
import parquet.bytes.BytesInput;
Expand All @@ -46,11 +49,12 @@
import java.util.Map;
import java.util.Set;

import static parquet.column.Encoding.BIT_PACKED;
import static parquet.column.Encoding.PLAIN_DICTIONARY;
import static parquet.column.Encoding.RLE;

public final class ParquetPredicateUtils
{
// definition level, repetition level, value
private static final int PARQUET_DATA_TRIPLE = 3;

private ParquetPredicateUtils()
{
}
Expand Down Expand Up @@ -188,14 +192,22 @@ private static boolean isColumnPredicate(ColumnDescriptor columnDescriptor, Tupl
.anyMatch(columnName::equals);
}

private static boolean isOnlyDictionaryEncodingPages(Set<Encoding> encodings)
@VisibleForTesting
@SuppressWarnings("deprecation")
static boolean isOnlyDictionaryEncodingPages(Set<Encoding> encodings)
{
// more than one encoding is used for values
if (encodings.size() > PARQUET_DATA_TRIPLE) {
return false;
// TODO: update to use EncodingStats in ColumnChunkMetaData when available
if (encodings.contains(PLAIN_DICTIONARY)) {
// PLAIN_DICTIONARY was present, which means at least one page was
// dictionary-encoded and 1.0 encodings are used.
// The only other allowed encodings are RLE and BIT_PACKED, which are used for repetition or definition levels.
return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
}
// definition level, repetition level never have dictionary encoding
// TODO: add PageEncodingStats in ColumnChunkMetaData
return encodings.stream().anyMatch(Encoding::usesDictionary);

// If PLAIN_DICTIONARY wasn't present, then either the column is not
// dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
// For 2.0, this cannot determine whether a page fell back to a
// non-dictionary encoding without page encoding stats.
return false;
}
}
@@ -0,0 +1,55 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive.parquet.predicate;

import com.google.common.collect.ImmutableSet;
import org.testng.annotations.Test;
import parquet.column.Encoding;

import java.util.Set;

import static com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.isOnlyDictionaryEncodingPages;
import static com.google.common.collect.Sets.union;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import static parquet.column.Encoding.BIT_PACKED;
import static parquet.column.Encoding.PLAIN;
import static parquet.column.Encoding.PLAIN_DICTIONARY;
import static parquet.column.Encoding.RLE;

public class TestParquetPredicateUtils
{
    /**
     * Exercises {@code isOnlyDictionaryEncodingPages} with encoding sets built from
     * PLAIN, PLAIN_DICTIONARY, RLE and BIT_PACKED. Only the sets whose sole value
     * encoding is PLAIN_DICTIONARY are expected to be reported as dictionary-only.
     */
    @Test
    @SuppressWarnings("deprecation")
    public void testDictionaryEncodingCasesV1()
    {
        // level-encoding sets, named after the kind of field each one stands for in this test
        Set<Encoding> requiredLevels = ImmutableSet.of(BIT_PACKED);
        Set<Encoding> optionalLevels = ImmutableSet.of(BIT_PACKED, RLE);
        Set<Encoding> repeatedLevels = ImmutableSet.of(RLE);

        // value-encoding sets under test
        Set<Encoding> plainValues = ImmutableSet.of(PLAIN);
        Set<Encoding> dictionaryAndPlainValues = ImmutableSet.of(PLAIN_DICTIONARY, PLAIN);
        Set<Encoding> dictionaryValues = ImmutableSet.of(PLAIN_DICTIONARY);

        // no dictionary encoding at all
        expectNotDictionaryOnly(requiredLevels, plainValues, "required notDictionary");
        expectNotDictionaryOnly(optionalLevels, plainValues, "optional notDictionary");
        expectNotDictionaryOnly(repeatedLevels, plainValues, "repeated notDictionary");

        // dictionary encoding mixed with PLAIN
        expectNotDictionaryOnly(requiredLevels, dictionaryAndPlainValues, "required mixedDictionary");
        expectNotDictionaryOnly(optionalLevels, dictionaryAndPlainValues, "optional mixedDictionary");
        expectNotDictionaryOnly(repeatedLevels, dictionaryAndPlainValues, "repeated mixedDictionary");

        // PLAIN_DICTIONARY is the only value encoding
        expectDictionaryOnly(requiredLevels, dictionaryValues, "required dictionary");
        expectDictionaryOnly(optionalLevels, dictionaryValues, "optional dictionary");
        expectDictionaryOnly(repeatedLevels, dictionaryValues, "repeated dictionary");
    }

    // Asserts that the union of the two encoding sets is reported as dictionary-only.
    private static void expectDictionaryOnly(Set<Encoding> levelEncodings, Set<Encoding> valueEncodings, String message)
    {
        assertTrue(isOnlyDictionaryEncodingPages(union(levelEncodings, valueEncodings)), message);
    }

    // Asserts that the union of the two encoding sets is NOT reported as dictionary-only.
    private static void expectNotDictionaryOnly(Set<Encoding> levelEncodings, Set<Encoding> valueEncodings, String message)
    {
        assertFalse(isOnlyDictionaryEncodingPages(union(levelEncodings, valueEncodings)), message);
    }
}

0 comments on commit a280baf

Please sign in to comment.