Support reading BYTE_STREAM_SPLIT encoding in parquet
Introduced DoubleApacheParquetValueDecoder and FloatApacheParquetValueDecoder
to handle double and float types, respectively.
manupatteri authored and raunaqmorarka committed Mar 25, 2024
1 parent 7c5d56e commit 216419d
Showing 8 changed files with 212 additions and 70 deletions.
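For context, BYTE_STREAM_SPLIT is the Parquet encoding that splits each FLOAT or DOUBLE value into its component bytes and stores byte i of every value in a contiguous stream i. The layout does not shrink the data by itself, but it tends to compress much better afterwards. A minimal standalone sketch of the decode step (illustrative only, not Trino code; assumes the buffer holds exactly the concatenated streams of a FLOAT page):

// Reassemble floats from a BYTE_STREAM_SPLIT page: stream b holds byte b
// (little-endian order, matching Parquet's plain float layout) of every value.
static float[] decodeByteStreamSplitFloats(byte[] page)
{
    int n = page.length / Float.BYTES;
    float[] out = new float[n];
    for (int i = 0; i < n; i++) {
        int bits = 0;
        for (int b = 0; b < Float.BYTES; b++) {
            bits |= (page[b * n + i] & 0xFF) << (8 * b);
        }
        out[i] = Float.intBitsToFloat(bits);
    }
    return out;
}

The change below does not reimplement this logic in Trino; it delegates to the Apache parquet-mr readers (ByteStreamSplitValuesReaderForFloat/-Double) behind new ValueDecoder adapters.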
@@ -23,6 +23,8 @@
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesReader;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFloat;
import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader;
import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader;
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
@@ -44,7 +46,9 @@
import static org.apache.parquet.column.values.bitpacking.Packer.BIG_ENDIAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;

@@ -149,6 +153,20 @@ public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dic
        {
            return PLAIN.initDictionary(descriptor, dictionaryPage);
        }
    },

    BYTE_STREAM_SPLIT {
        @Override
        public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType)
        {
            PrimitiveTypeName typeName = descriptor.getPrimitiveType().getPrimitiveTypeName();
            checkArgument(typeName == FLOAT || typeName == DOUBLE, "Encoding BYTE_STREAM_SPLIT is only " +
                    "supported for type FLOAT and DOUBLE");
            if (typeName == FLOAT) {
                return new ByteStreamSplitValuesReaderForFloat();
            }
            return new ByteStreamSplitValuesReaderForDouble();
        }
    };

    static final int INT96_TYPE_LENGTH = 12;
@@ -162,8 +162,7 @@ public static ParquetEncoding getParquetEncoding(Encoding encoding)
         case RLE:
             return ParquetEncoding.RLE;
         case BYTE_STREAM_SPLIT:
-            // TODO: https://github.com/trinodb/trino/issues/8357
-            throw new ParquetDecodingException("Unsupported Parquet encoding: " + encoding);
+            return ParquetEncoding.BYTE_STREAM_SPLIT;
         case BIT_PACKED:
             return ParquetEncoding.BIT_PACKED;
         case PLAIN_DICTIONARY:
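This mapping is the gate for the whole feature: previously the Apache Encoding.BYTE_STREAM_SPLIT constant was rejected outright (tracked as trinodb/trino#8357), and it now translates to the new Trino-side constant. Roughly, assuming a caller with access to this static method:

// Hypothetical caller, for illustration only.
// Before this commit: threw ParquetDecodingException("Unsupported Parquet encoding: BYTE_STREAM_SPLIT")
// After: returns the Trino-side constant
ParquetEncoding encoding = getParquetEncoding(org.apache.parquet.column.Encoding.BYTE_STREAM_SPLIT);
assert encoding == ParquetEncoding.BYTE_STREAM_SPLIT;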
@@ -22,6 +22,8 @@
import java.nio.ByteBuffer;

import static io.trino.parquet.ParquetReaderUtils.castToByte;
import static java.lang.Double.doubleToLongBits;
import static java.lang.Float.floatToIntBits;
import static java.util.Objects.requireNonNull;

/**
@@ -70,4 +72,78 @@ public void skip(int n)
            delegate.skip(n);
        }
    }

    public static final class DoubleApacheParquetValueDecoder
            implements ValueDecoder<long[]>
    {
        private final ValuesReader delegate;

        public DoubleApacheParquetValueDecoder(ValuesReader delegate)
        {
            this.delegate = requireNonNull(delegate, "delegate is null");
        }

        @Override
        public void init(SimpleSliceInputStream input)
        {
            initialize(input, delegate, Double.BYTES);
        }

        @Override
        public void read(long[] values, int offset, int length)
        {
            // Trino surfaces double columns as their raw IEEE 754 bit patterns in long[]
            for (int i = offset; i < offset + length; i++) {
                values[i] = doubleToLongBits(delegate.readDouble());
            }
        }

        @Override
        public void skip(int n)
        {
            delegate.skip(n);
        }
    }

    public static final class FloatApacheParquetValueDecoder
            implements ValueDecoder<int[]>
    {
        private final ValuesReader delegate;

        public FloatApacheParquetValueDecoder(ValuesReader delegate)
        {
            this.delegate = requireNonNull(delegate, "delegate is null");
        }

        @Override
        public void init(SimpleSliceInputStream input)
        {
            initialize(input, delegate, Float.BYTES);
        }

        @Override
        public void read(int[] values, int offset, int length)
        {
            // Floats are surfaced as their raw IEEE 754 bit patterns in int[]
            for (int i = offset; i < offset + length; i++) {
                values[i] = floatToIntBits(delegate.readFloat());
            }
        }

        @Override
        public void skip(int n)
        {
            delegate.skip(n);
        }
    }

    private static void initialize(SimpleSliceInputStream input, ValuesReader reader, int elementSizeInBytes)
    {
        byte[] buffer = input.readBytes();
        try {
            // Every value in the page has a fixed width, so the value count
            // is simply the buffer length divided by the element size
            int valueCount = buffer.length / elementSizeInBytes;
            reader.initFromPage(valueCount, ByteBufferInputStream.wrap(ByteBuffer.wrap(buffer, 0, buffer.length)));
        }
        catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}
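The two adapters are symmetric: both hand the page bytes to a parquet-mr ValuesReader and surface the decoded values as Trino's columnar bit patterns (long[] for doubles, int[] for floats). A hedged wiring sketch, mirroring what the decoder factory below does (`input` and `count` are assumed to come from the surrounding reader):

// Illustrative only; `input` wraps the data page, `count` is the page's value count.
ValueDecoder<long[]> decoder = new DoubleApacheParquetValueDecoder(new ByteStreamSplitValuesReaderForDouble());
decoder.init(input);            // passes all remaining page bytes to the Apache reader
long[] values = new long[count];
decoder.read(values, 0, count); // each slot holds doubleToLongBits(decoded value)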
@@ -37,6 +37,7 @@
import org.joda.time.DateTimeZone;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.parquet.ParquetEncoding.BYTE_STREAM_SPLIT;
import static io.trino.parquet.ParquetEncoding.DELTA_BYTE_ARRAY;
import static io.trino.parquet.ParquetEncoding.PLAIN;
import static io.trino.parquet.ParquetReaderUtils.toByteExact;
@@ -45,6 +46,8 @@
import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue;
import static io.trino.parquet.ValuesType.VALUES;
import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.BooleanApacheParquetValueDecoder;
import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.DoubleApacheParquetValueDecoder;
import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.FloatApacheParquetValueDecoder;
import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedByteDecoder;
import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedIntDecoder;
import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedLongDecoder;
@@ -112,6 +115,9 @@ public ValueDecoder<long[]> getDoubleDecoder(ParquetEncoding encoding)
        if (PLAIN.equals(encoding)) {
            return new LongPlainValueDecoder();
        }
        else if (BYTE_STREAM_SPLIT.equals(encoding)) {
            return new DoubleApacheParquetValueDecoder(getApacheParquetReader(encoding));
        }
        throw wrongEncoding(encoding);
    }

@@ -120,6 +126,9 @@ public ValueDecoder<int[]> getRealDecoder(ParquetEncoding encoding)
        if (PLAIN.equals(encoding)) {
            return new IntPlainValueDecoder();
        }
        else if (BYTE_STREAM_SPLIT.equals(encoding)) {
            return new FloatApacheParquetValueDecoder(getApacheParquetReader(encoding));
        }
        throw wrongEncoding(encoding);
    }

@@ -0,0 +1,106 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.parquet.reader;

import com.google.common.collect.ImmutableList;
import com.google.common.io.Resources;
import io.trino.parquet.ParquetDataSource;
import io.trino.parquet.ParquetReaderOptions;
import io.trino.spi.Page;
import io.trino.spi.block.Block;
import io.trino.spi.block.IntArrayBlock;
import io.trino.spi.type.Type;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Optional;
import java.util.stream.IntStream;

import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext;
import static io.trino.parquet.ParquetTestUtils.createParquetReader;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.testing.TestingConnectorSession.SESSION;
import static org.assertj.core.api.Assertions.assertThat;
import static org.testng.Assert.assertEquals;

public class TestByteStreamSplitEncoding
{
    @Test
    public void testReadFloatDouble()
            throws URISyntaxException, IOException
    {
        List<String> columnNames = ImmutableList.of("columnA", "columnB");
        List<Type> types = ImmutableList.of(REAL, DOUBLE);

        ParquetDataSource dataSource = new FileParquetDataSource(
                new File(Resources.getResource("byte_stream_split_float_and_double.parquet").toURI()),
                new ParquetReaderOptions());
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty());
        ParquetReader reader = createParquetReader(dataSource, parquetMetadata, newSimpleAggregatedMemoryContext(), types, columnNames);

        readAndCompare(reader, getExpectedValues());
    }

    private static List<List<Double>> getExpectedValues()
    {
        ImmutableList.Builder<Double> floatsBuilder = ImmutableList.builder();
        for (int i = 0; i < 10; i++) {
            IntStream.range(0, 10)
                    .mapToDouble(j -> j * 1.3)
                    .forEach(floatsBuilder::add);
        }

        ImmutableList.Builder<Double> doublesBuilder = ImmutableList.builder();
        for (int i = 0; i < 10; i++) {
            IntStream.range(0, 10)
                    .mapToDouble(j -> j * 1.5)
                    .forEach(doublesBuilder::add);
        }
        return ImmutableList.of(floatsBuilder.build(), doublesBuilder.build());
    }

    private static void readAndCompare(ParquetReader reader, List<List<Double>> expected)
            throws IOException
    {
        int rowCount = 0;
        int pageCount = 0;
        Page page = reader.nextPage();
        while (page != null) {
            assertThat(page.getChannelCount()).isEqualTo(2);
            if (pageCount % 2 == 1) { // Verify only every other page; the skipped ones stay unloaded
                for (int channel = 0; channel < page.getChannelCount(); channel++) {
                    Block block = page.getBlock(channel).getLoadedBlock();
                    List<Double> expectedValues = expected.get(channel);
                    for (int position = 0; position < block.getPositionCount(); position++) {
                        if (block instanceof IntArrayBlock) {
                            assertEquals(REAL.getObjectValue(SESSION, block, position), expectedValues.get(rowCount + position).floatValue());
                        }
                        else {
                            assertEquals(DOUBLE.getObjectValue(SESSION, block, position), expectedValues.get(rowCount + position));
                        }
                    }
                }
            }
            rowCount += page.getPositionCount();
            pageCount++;
            page = reader.nextPage();
        }
        assertThat(rowCount).isEqualTo(100);
    }
}
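One subtlety in the expected values above: both columns are generated in double precision, so the assertions for the REAL column narrow with floatValue() before comparing. A tiny self-contained illustration (hypothetical values) of why that matters:

// (float) narrowing changes the bits, so a double-vs-float comparison would fail.
double expected = 3 * 1.3;        // 3.9000000000000004
float stored = (float) expected;  // what a REAL column actually holds
System.out.println(expected == stored);          // false: 3.9f widens back to roughly 3.90000010
System.out.println((float) expected == stored);  // true: compare at float precision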
@@ -15,18 +15,16 @@

import com.google.common.collect.ImmutableList;
import io.trino.parquet.PrimitiveField;
import io.trino.parquet.reader.SimpleSliceInputStream;
import io.trino.spi.type.DoubleType;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.column.values.ValuesWriter;

import java.util.OptionalInt;
import java.util.Random;

import static io.trino.parquet.ParquetEncoding.PLAIN;
import static io.trino.parquet.ParquetEncoding.RLE_DICTIONARY;
import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.DoubleApacheParquetValueDecoder;
import static io.trino.parquet.reader.flat.LongColumnAdapter.LONG_ADAPTER;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.assertj.core.api.Assertions.assertThat;

@@ -108,35 +106,4 @@ private static DataBuffer writeDoubles(ValuesWriter valuesWriter, double[] input

        return getWrittenBuffer(valuesWriter);
    }

    private static final class DoubleApacheParquetValueDecoder
            implements ValueDecoder<long[]>
    {
        private final ValuesReader delegate;

        public DoubleApacheParquetValueDecoder(ValuesReader delegate)
        {
            this.delegate = requireNonNull(delegate, "delegate is null");
        }

        @Override
        public void init(SimpleSliceInputStream input)
        {
            initialize(input, delegate);
        }

        @Override
        public void read(long[] values, int offset, int length)
        {
            for (int i = offset; i < offset + length; i++) {
                values[i] = Double.doubleToLongBits(delegate.readDouble());
            }
        }

        @Override
        public void skip(int n)
        {
            delegate.skip(n);
        }
    }
}
@@ -15,18 +15,16 @@

import com.google.common.collect.ImmutableList;
import io.trino.parquet.PrimitiveField;
import io.trino.parquet.reader.SimpleSliceInputStream;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.column.values.ValuesWriter;

import java.util.OptionalInt;
import java.util.Random;

import static io.trino.parquet.ParquetEncoding.PLAIN;
import static io.trino.parquet.ParquetEncoding.RLE_DICTIONARY;
import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.FloatApacheParquetValueDecoder;
import static io.trino.parquet.reader.flat.IntColumnAdapter.INT_ADAPTER;
import static io.trino.spi.type.RealType.REAL;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static org.assertj.core.api.Assertions.assertThat;

@@ -108,35 +106,4 @@ private static DataBuffer writeFloats(ValuesWriter valuesWriter, float[] input)

        return getWrittenBuffer(valuesWriter);
    }

    private static final class FloatApacheParquetValueDecoder
            implements ValueDecoder<int[]>
    {
        private final ValuesReader delegate;

        public FloatApacheParquetValueDecoder(ValuesReader delegate)
        {
            this.delegate = requireNonNull(delegate, "delegate is null");
        }

        @Override
        public void init(SimpleSliceInputStream input)
        {
            initialize(input, delegate);
        }

        @Override
        public void read(int[] values, int offset, int length)
        {
            for (int i = offset; i < offset + length; i++) {
                values[i] = Float.floatToIntBits(delegate.readFloat());
            }
        }

        @Override
        public void skip(int n)
        {
            delegate.skip(n);
        }
    }
}
Binary test resource byte_stream_split_float_and_double.parquet not shown.
