Skip reading Parquet pages using the Column Index feature of Parquet

* Add tests verifying equivalent results with and without the feature enabled
* Fix column synchronization logic
* Take into account empty min/max values
* Fix issues around type casts of numeric values
* Remove ColumnIndexFilterUtils and replace with FilteredOffsetIndex
* Add support for Decimal type
* Remove reference to INT96 since it's not supported for column indexes
James Taylor authored and martint committed Aug 18, 2021
1 parent 39100a9 commit 6eb42f2
Showing 32 changed files with 3,772 additions and 171 deletions.
@@ -233,6 +233,14 @@ static SortedRangeSet of(Range first, Range... rest)
return copyOf(first.getType(), rangeList);
}

static SortedRangeSet of(List<Range> rangeList)
{
if (rangeList.isEmpty()) {
throw new IllegalArgumentException("cannot use empty rangeList");
}
return copyOf(rangeList.get(0).getType(), rangeList);
}

private static SortedRangeSet of(Type type, Object value)
{
checkNotNaN(type, value);
@@ -79,6 +79,11 @@ static ValueSet ofRanges(Range first, Range... rest)
return SortedRangeSet.of(first, rest);
}

static ValueSet ofRanges(List<Range> ranges)
{
return SortedRangeSet.of(ranges);
}

static ValueSet copyOfRanges(Type type, Collection<Range> ranges)
{
return SortedRangeSet.copyOf(type, ranges);
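The list-based factories above mirror the existing varargs overloads, taking the element type from the first range. A minimal usage sketch, assuming only the Range factories and the BIGINT type constant from the existing Trino SPI:

import com.google.common.collect.ImmutableList;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.ValueSet;

import java.util.List;

import static io.trino.spi.type.BigintType.BIGINT;

public class ValueSetOfRangesExample
{
    public static void main(String[] args)
    {
        // Two disjoint ranges: x < 0 OR x > 10
        List<Range> ranges = ImmutableList.of(
                Range.lessThan(BIGINT, 0L),
                Range.greaterThan(BIGINT, 10L));
        // The new overload reads the type off the first range, which is
        // why it rejects an empty list (see the check added above).
        ValueSet domain = ValueSet.ofRanges(ranges);
    }
}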
14 changes: 13 additions & 1 deletion lib/trino-parquet/src/main/java/io/trino/parquet/DataPage.java
@@ -13,15 +13,27 @@
*/
package io.trino.parquet;

import java.util.OptionalLong;

public abstract class DataPage
extends Page
{
protected final int valueCount;
private final OptionalLong firstRowIndex;

-public DataPage(int uncompressedSize, int valueCount)
+public DataPage(int uncompressedSize, int valueCount, OptionalLong firstRowIndex)
{
super(uncompressedSize);
this.valueCount = valueCount;
this.firstRowIndex = firstRowIndex;
}

/**
* @return the index of the first row in this page, or an empty OptionalLong if unset.
*/
public OptionalLong getFirstRowIndex()
{
return firstRowIndex;
}

public int getValueCount()
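Downstream readers can use the new accessor to translate page-local positions into file-global row indexes. A minimal sketch, assuming a helper of this shape on the reader side (it is not part of this commit):

import io.trino.parquet.DataPage;

import java.util.OptionalLong;

public class FirstRowIndexExample
{
    // Maps a position inside the page to a global row index. When no offset
    // index was read, firstRowIndex is empty and the page is assumed to
    // start at row 0 of its row group.
    static long globalRowIndex(DataPage page, int positionInPage)
    {
        OptionalLong firstRowIndex = page.getFirstRowIndex();
        return firstRowIndex.orElse(0L) + positionInPage;
    }
}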
@@ -15,6 +15,8 @@

import io.airlift.slice.Slice;

import java.util.OptionalLong;

import static com.google.common.base.MoreObjects.toStringHelper;
import static java.util.Objects.requireNonNull;

@@ -30,11 +32,12 @@ public DataPageV1(
Slice slice,
int valueCount,
int uncompressedSize,
OptionalLong firstRowIndex,
ParquetEncoding repetitionLevelEncoding,
ParquetEncoding definitionLevelEncoding,
ParquetEncoding valuesEncoding)
{
-super(uncompressedSize, valueCount);
+super(uncompressedSize, valueCount, firstRowIndex);
this.slice = requireNonNull(slice, "slice is null");
this.repetitionLevelEncoding = repetitionLevelEncoding;
this.definitionLevelEncoding = definitionLevelEncoding;
@@ -16,6 +16,8 @@
import io.airlift.slice.Slice;
import org.apache.parquet.column.statistics.Statistics;

import java.util.OptionalLong;

import static com.google.common.base.MoreObjects.toStringHelper;
import static java.util.Objects.requireNonNull;

@@ -40,10 +42,11 @@ public DataPageV2(
ParquetEncoding dataEncoding,
Slice slice,
int uncompressedSize,
OptionalLong firstRowIndex,
Statistics<?> statistics,
boolean isCompressed)
{
-super(uncompressedSize, valueCount);
+super(uncompressedSize, valueCount, firstRowIndex);
this.rowCount = rowCount;
this.nullCount = nullCount;
this.repetitionLevels = requireNonNull(repetitionLevels, "repetitionLevels slice is null");
@@ -13,11 +13,11 @@
*/
package io.trino.parquet;

+import com.google.common.collect.ListMultimap;
import io.airlift.slice.Slice;

import java.io.Closeable;
import java.io.IOException;
-import java.util.Map;

public interface ParquetDataSource
extends Closeable
@@ -34,7 +34,7 @@ public interface ParquetDataSource

Slice readFully(long position, int length);

-<K> Map<K, ChunkReader> planRead(Map<K, DiskRange> diskRanges);
+<K> ListMultimap<K, ChunkReader> planRead(ListMultimap<K, DiskRange> diskRanges);

@Override
default void close()
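The switch from Map to ListMultimap is what enables page skipping at the I/O layer: once the column index eliminates page runs, a single column chunk decomposes into several disjoint byte ranges under the same key. A hypothetical sketch (the column name and offsets are invented; DiskRange's (offset, length) constructor is assumed):

import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ListMultimap;
import io.trino.parquet.ChunkReader;
import io.trino.parquet.DiskRange;
import io.trino.parquet.ParquetDataSource;

public class PlanReadExample
{
    static ListMultimap<String, ChunkReader> planFilteredRead(ParquetDataSource dataSource)
    {
        // One key, two disjoint ranges: the gap between them holds pages
        // that the column index proved cannot match the predicate.
        ListMultimap<String, DiskRange> ranges = ImmutableListMultimap.<String, DiskRange>builder()
                .put("orderkey", new DiskRange(4, 1_000))
                .put("orderkey", new DiskRange(50_000, 2_000))
                .build();
        return dataSource.planRead(ranges);
    }
}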
@@ -28,25 +28,29 @@ public class ParquetReaderOptions
private final DataSize maxReadBlockSize;
private final DataSize maxMergeDistance;
private final DataSize maxBufferSize;
private final boolean useColumnIndex;

public ParquetReaderOptions()
{
ignoreStatistics = false;
maxReadBlockSize = DEFAULT_MAX_READ_BLOCK_SIZE;
maxMergeDistance = DEFAULT_MAX_MERGE_DISTANCE;
maxBufferSize = DEFAULT_MAX_BUFFER_SIZE;
useColumnIndex = true;
}

private ParquetReaderOptions(
boolean ignoreStatistics,
DataSize maxReadBlockSize,
DataSize maxMergeDistance,
-DataSize maxBufferSize)
+DataSize maxBufferSize,
+boolean useColumnIndex)
{
this.ignoreStatistics = ignoreStatistics;
this.maxReadBlockSize = requireNonNull(maxReadBlockSize, "maxReadBlockSize is null");
this.maxMergeDistance = requireNonNull(maxMergeDistance, "maxMergeDistance is null");
this.maxBufferSize = requireNonNull(maxBufferSize, "maxBufferSize is null");
this.useColumnIndex = useColumnIndex;
}

public boolean isIgnoreStatistics()
@@ -64,6 +68,11 @@ public DataSize getMaxMergeDistance()
return maxMergeDistance;
}

public boolean isUseColumnIndex()
{
return useColumnIndex;
}

public DataSize getMaxBufferSize()
{
return maxBufferSize;
@@ -75,7 +84,8 @@ public ParquetReaderOptions withIgnoreStatistics(boolean ignoreStatistics)
ignoreStatistics,
maxReadBlockSize,
maxMergeDistance,
-maxBufferSize);
+maxBufferSize,
+useColumnIndex);
}

public ParquetReaderOptions withMaxReadBlockSize(DataSize maxReadBlockSize)
@@ -84,7 +94,8 @@ public ParquetReaderOptions withMaxReadBlockSize(DataSize maxReadBlockSize)
ignoreStatistics,
maxReadBlockSize,
maxMergeDistance,
-maxBufferSize);
+maxBufferSize,
+useColumnIndex);
}

public ParquetReaderOptions withMaxMergeDistance(DataSize maxMergeDistance)
@@ -93,7 +104,8 @@ public ParquetReaderOptions withMaxMergeDistance(DataSize maxMergeDistance)
ignoreStatistics,
maxReadBlockSize,
maxMergeDistance,
-maxBufferSize);
+maxBufferSize,
+useColumnIndex);
}

public ParquetReaderOptions withMaxBufferSize(DataSize maxBufferSize)
@@ -102,6 +114,17 @@ public ParquetReaderOptions withMaxBufferSize(DataSize maxBufferSize)
ignoreStatistics,
maxReadBlockSize,
maxMergeDistance,
-maxBufferSize);
+maxBufferSize,
+useColumnIndex);
}

public ParquetReaderOptions withUseColumnIndex(boolean useColumnIndex)
{
return new ParquetReaderOptions(
ignoreStatistics,
maxReadBlockSize,
maxMergeDistance,
maxBufferSize,
useColumnIndex);
}
}
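Column index filtering defaults to enabled, so the new withUseColumnIndex toggle mostly serves opting out. A short usage sketch:

import io.trino.parquet.ParquetReaderOptions;

public class ReaderOptionsExample
{
    public static void main(String[] args)
    {
        // Disable column index filtering, e.g. to compare results with and
        // without the feature, as the new tests in this commit do.
        ParquetReaderOptions options = new ParquetReaderOptions()
                .withUseColumnIndex(false);
        System.out.println(options.isUseColumnIndex()); // false
    }
}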
@@ -13,6 +13,7 @@
*/
package io.trino.parquet;

import io.airlift.slice.Slice;
import io.trino.spi.type.DecimalType;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.io.ColumnIO;
@@ -25,13 +26,16 @@
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;

import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.spi.type.UnscaledDecimal128Arithmetic.unscaledDecimal;
import static org.apache.parquet.schema.OriginalType.DECIMAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;

@@ -241,4 +245,24 @@ public static long getShortDecimalValue(byte[] bytes)

return value;
}

public static Slice getLongDecimalValue(byte[] bytes)
{
BigInteger value = new BigInteger(bytes);
return unscaledDecimal(value);
}

public static long getShortDecimalValue(ByteBuffer buffer)
{
byte[] array = new byte[buffer.remaining()];
buffer.get(array);
return getShortDecimalValue(array);
}

public static Slice getLongDecimalValue(ByteBuffer buffer)
{
byte[] array = new byte[buffer.remaining()];
buffer.get(array);
return getLongDecimalValue(array);
}
}
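A sketch of the new ByteBuffer overloads, which decode the big-endian two's-complement unscaled values that Parquet stores for DECIMAL min/max entries in column indexes (the value here is invented):

import io.airlift.slice.Slice;

import java.math.BigInteger;
import java.nio.ByteBuffer;

import static io.trino.parquet.ParquetTypeUtils.getLongDecimalValue;
import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue;

public class DecimalDecodeExample
{
    public static void main(String[] args)
    {
        // Unscaled value 123456 as a minimal big-endian byte array, the way
        // it would appear in a column index min/max entry.
        ByteBuffer encoded = ByteBuffer.wrap(BigInteger.valueOf(123_456).toByteArray());

        // Short decimals (precision <= 18) fit in a long...
        long shortDecimal = getShortDecimalValue(encoded.duplicate());

        // ...while longer ones come back as an unscaled 128-bit Slice.
        Slice longDecimal = getLongDecimalValue(encoded.duplicate());
    }
}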
@@ -13,13 +13,13 @@
*/
package io.trino.parquet.predicate;

-public class ParquetIntegerStatistics
+public class ParquetLongStatistics
implements ParquetRangeStatistics<Long>
{
private final Long minimum;
private final Long maximum;

-public ParquetIntegerStatistics(Long minimum, Long maximum)
+public ParquetLongStatistics(Long minimum, Long maximum)
{
this.minimum = minimum;
this.maximum = maximum;
@@ -15,13 +15,13 @@

import io.airlift.slice.Slice;

-public class ParquetStringStatistics
+public class ParquetSliceStatistics
implements ParquetRangeStatistics<Slice>
{
private final Slice minimum;
private final Slice maximum;

-public ParquetStringStatistics(Slice minimum, Slice maximum)
+public ParquetSliceStatistics(Slice minimum, Slice maximum)
{
this.minimum = minimum;
this.maximum = maximum;
@@ -17,8 +17,12 @@
import io.trino.parquet.ParquetDataSourceId;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.joda.time.DateTimeZone;

import java.util.Map;
import java.util.Optional;

public interface Predicate
{
@@ -41,4 +45,23 @@ boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statisti
* @param dictionary The single column dictionary
*/
boolean matches(DictionaryDescriptor dictionary);

/**
* Should the Parquet Reader process a file section with the specified statistics.
*
* @param numberOfRows the number of rows in the segment; this can be used with
* Statistics to determine if a column is only null
* @param columnIndex column index (statistics) store
* @param id Parquet file name
*/
boolean matches(long numberOfRows, ColumnIndexStore columnIndex, ParquetDataSourceId id)
throws ParquetCorruptionException;

/**
* Convert Predicate to Parquet filter if possible.
*
* @param timeZone current Parquet timezone
* @return Converted Parquet filter, or Optional.empty() if conversion is not possible
*/
Optional<FilterPredicate> toParquetFilter(DateTimeZone timeZone);
}
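toParquetFilter is what lets the reader delegate page-level filtering to parquet-mr. A sketch of how the two new methods could be combined on the reader side, assuming parquet-mr's ColumnIndexFilter.calculateRowRanges API (the surrounding reader wiring is not part of this interface):

import io.trino.parquet.predicate.Predicate;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;
import org.joda.time.DateTimeZone;

import java.util.Optional;
import java.util.Set;

public class RowRangesExample
{
    // Returns the row ranges that survive column index filtering, or empty
    // when the predicate cannot be expressed as a Parquet filter (in which
    // case every page must be read).
    static Optional<RowRanges> calculateRowRanges(
            Predicate predicate,
            ColumnIndexStore columnIndexStore,
            Set<ColumnPath> paths,
            long rowGroupRowCount)
    {
        return predicate.toParquetFilter(DateTimeZone.UTC)
                .map(filter -> ColumnIndexFilter.calculateRowRanges(
                        FilterCompat.get(filter), columnIndexStore, paths, rowGroupRowCount));
    }
}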
