Skip to content

Commit

Permalink
HIVE-26658: INT64 Parquet timestamps cannot be mapped to most Hive numeric types (Stamatis Zampetakis reviewed by Chris Nauroth, Steve Carlin, Ayush Saxena)
Browse files Browse the repository at this point in the history

Closes apache#3698
  • Loading branch information
zabetak authored and yeahyung committed Jul 20, 2023
1 parent 9e395f6 commit d18d88c
Show file tree
Hide file tree
Showing 4 changed files with 295 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
Expand Down Expand Up @@ -448,6 +449,21 @@ public void addLong(long value) {
}
}
};
case serdeConstants.TIMESTAMP_TYPE_NAME:
case serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME:
if (type.getLogicalTypeAnnotation() instanceof TimestampLogicalTypeAnnotation) {
TimestampLogicalTypeAnnotation logicalType =
(TimestampLogicalTypeAnnotation) type.getLogicalTypeAnnotation();
return new PrimitiveConverter() {
@Override
public void addLong(final long value) {
Timestamp timestamp =
ParquetTimestampUtils.getTimestamp(value, logicalType.getUnit(), logicalType.isAdjustedToUTC());
parent.set(index, new TimestampWritableV2(timestamp));
}
};
}
throw new IllegalStateException("Cannot reliably convert INT64 value to timestamp without type annotation");
default:
return new PrimitiveConverter() {
@Override
Expand Down Expand Up @@ -743,40 +759,6 @@ protected TimestampWritableV2 convert(Binary binary) {
};
}
},
EINT64_TIMESTAMP_CONVERTER(TimestampWritableV2.class) {
  /**
   * Builds a converter for INT64 Parquet values annotated as timestamps.
   *
   * <p>When the requested Hive type is BIGINT the raw epoch value is surfaced as a
   * {@link LongWritable} (out-of-range values become NULL, mirroring the other numeric
   * converters). Otherwise the value is decoded into a {@link TimestampWritableV2} using
   * the Parquet {@link TimestampLogicalTypeAnnotation} (time unit and UTC adjustment).
   *
   * @param type         the Parquet primitive type of the column
   * @param index        position in the parent converter to write the result into
   * @param parent       receiver of the converted value
   * @param hiveTypeInfo requested Hive type, may be null
   * @throws IllegalStateException if a timestamp conversion is required but the Parquet
   *         type carries no timestamp logical type annotation
   */
  @Override
  PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent,
      TypeInfo hiveTypeInfo) {
    if (hiveTypeInfo != null) {
      String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
      final long min = getMinValue(type, typeName, Long.MIN_VALUE);
      final long max = getMaxValue(typeName, Long.MAX_VALUE);

      switch (typeName) {
        case serdeConstants.BIGINT_TYPE_NAME:
          return new PrimitiveConverter() {
            @Override
            public void addLong(long value) {
              // Out-of-range values are mapped to NULL, consistent with the
              // other numeric narrowing converters in this enum.
              if ((value >= min) && (value <= max)) {
                parent.set(index, new LongWritable(value));
              } else {
                parent.set(index, null);
              }
            }
          };
      }
    }
    // Guard the cast: an INT64 column without a timestamp annotation cannot be
    // decoded reliably — fail with a clear message instead of a ClassCastException.
    if (!(type.getLogicalTypeAnnotation() instanceof TimestampLogicalTypeAnnotation)) {
      throw new IllegalStateException(
          "Cannot reliably convert INT64 value to timestamp without a timestamp logical type annotation: " + type);
    }
    final TimestampLogicalTypeAnnotation logicalType =
        (TimestampLogicalTypeAnnotation) type.getLogicalTypeAnnotation();
    return new PrimitiveConverter() {
      @Override
      public void addLong(final long value) {
        Timestamp timestamp =
            ParquetTimestampUtils.getTimestamp(value, logicalType.getUnit(), logicalType.isAdjustedToUTC());
        parent.set(index, new TimestampWritableV2(timestamp));
      }
    };
  }
},
EDATE_CONVERTER(DateWritableV2.class) {
@Override
PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo) {
Expand Down Expand Up @@ -833,7 +815,8 @@ public Optional<PrimitiveConverter> visit(DateLogicalTypeAnnotation logicalTypeA

@Override
public Optional<PrimitiveConverter> visit(TimestampLogicalTypeAnnotation logicalTypeAnnotation) {
return Optional.of(EINT64_TIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo));
TypeInfo info = hiveTypeInfo == null ? TypeInfoFactory.timestampTypeInfo : hiveTypeInfo;
return Optional.of(EINT64_CONVERTER.getConverter(type, index, parent, info));
}
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package org.apache.hadoop.hive.ql.io.parquet.convert;

import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getPrimitiveTypeInfo;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.stringTypeInfo;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
Expand All @@ -27,6 +28,7 @@
import java.time.ZoneId;
import java.time.ZoneOffset;

import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.ql.io.parquet.convert.ETypeConverter.BinaryConverter;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
Expand All @@ -39,6 +41,7 @@
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
Expand Down Expand Up @@ -115,16 +118,84 @@ public void testGetDecimalConverterDoubleHiveType() throws Exception {
assertEquals(22, (int) doubleWritable.get());
}

@Test
public void testGetInt64TimestampConverterTinyIntHiveType() {
  // 5 epoch millis read back through a TINYINT schema must yield the byte value 5.
  final String ts = "1970-01-01 00:00:00.005";
  testGetInt64TimestampConverterNumericHiveType(ts, "tinyint", 5);
}

@Test
public void testGetInt64TimestampConverterSmallIntHiveType() {
  // 5 epoch millis read back through a SMALLINT schema must yield the short value 5.
  final String ts = "1970-01-01 00:00:00.005";
  testGetInt64TimestampConverterNumericHiveType(ts, "smallint", 5);
}

@Test
public void testGetInt64TimestampConverterIntHiveType() {
  // 5 epoch millis read back through an INT schema must yield the int value 5.
  final String ts = "1970-01-01 00:00:00.005";
  testGetInt64TimestampConverterNumericHiveType(ts, "int", 5);
}

@Test
public void testGetInt64TimestampConverterBigIntHiveType() {
  // 1998-10-03 09:58:31.231 == 907408711231 epoch millis; reading the INT64
  // timestamp column as BIGINT must surface the raw epoch-millis value.
  // (Removed the unused local Timestamp/msTime computation left over from the
  // pre-refactor version of this test.)
  testGetInt64TimestampConverterNumericHiveType("1998-10-03 09:58:31.231", "bigint", 907408711231L);
}

@Test
public void testGetInt64TimestampConverterFloatHiveType() {
  // 5 epoch millis read back through a FLOAT schema must yield 5.0f.
  final String ts = "1970-01-01 00:00:00.005";
  testGetInt64TimestampConverterNumericHiveType(ts, "float", 5.0f);
}

@Test
public void testGetInt64TimestampConverterDoubleHiveType() {
  // 5 epoch millis read back through a DOUBLE schema must yield 5.0d.
  final String ts = "1970-01-01 00:00:00.005";
  testGetInt64TimestampConverterNumericHiveType(ts, "double", 5.0d);
}

@Test
public void testGetInt64TimestampConverterDecimalHiveType() {
  // 5 epoch millis read back through a DECIMAL(1,0) schema must yield decimal 5.
  final String ts = "1970-01-01 00:00:00.005";
  testGetInt64TimestampConverterNumericHiveType(ts, "decimal(1,0)", HiveDecimal.create(5));
}

@Test
public void testGetInt64TimestampConverterNoHiveType() {
Timestamp ts = Timestamp.valueOf("2022-10-24 11:35:00.005");
PrimitiveType primitiveType = createInt64TimestampType(false, TimeUnit.MILLIS);
Writable writable = getWritableFromPrimitiveConverter(createHiveTypeInfo("bigint"), primitiveType, msTime);
// Retrieve as BigInt
LongWritable longWritable = (LongWritable) writable;
assertEquals(msTime, longWritable.get());
Writable writable = getWritableFromPrimitiveConverter(null, primitiveType, ts.toEpochMilli());
assertEquals("2022-10-24 11:35:00.005", ((TimestampWritableV2) writable).getTimestamp().toString());
}

@Test(expected = IllegalStateException.class)
public void testGetInt64NoLogicalAnnotationTimestampHiveType() {
  // A plain INT64 column with no timestamp logical annotation cannot be read as
  // a Hive TIMESTAMP — the converter must refuse instead of guessing the unit.
  final long epochMillis = Timestamp.valueOf("2022-10-24 11:43:00.005").toEpochMilli();
  final PrimitiveType plainInt64 = Types.optional(PrimitiveTypeName.INT64).named("int64");
  getWritableFromPrimitiveConverter(TypeInfoFactory.timestampTypeInfo, plainInt64, epochMillis);
}

/**
 * Converts the given timestamp (as INT64 epoch millis) through a Parquet INT64
 * timestamp column read with the requested numeric Hive type, then asserts the
 * unwrapped Writable equals the expected value.
 */
private void testGetInt64TimestampConverterNumericHiveType(String timestamp, String type, Object expected) {
  final Timestamp ts = Timestamp.valueOf(timestamp);
  final PrimitiveType parquetType = createInt64TimestampType(false, TimeUnit.MILLIS);
  final PrimitiveTypeInfo hiveType = getPrimitiveTypeInfo(type);
  final Writable result = getWritableFromPrimitiveConverter(hiveType, parquetType, ts.toEpochMilli());
  final Object actual;
  // Unwrap the Writable according to the Hive primitive category under test.
  switch (hiveType.getPrimitiveCategory()) {
    case BYTE:
    case SHORT:
    case INT:
      actual = ((IntWritable) result).get();
      break;
    case LONG:
      actual = ((LongWritable) result).get();
      break;
    case FLOAT:
      actual = ((FloatWritable) result).get();
      break;
    case DOUBLE:
      actual = ((DoubleWritable) result).get();
      break;
    case DECIMAL:
      actual = ((HiveDecimalWritable) result).getHiveDecimal();
      break;
    default:
      throw new IllegalStateException(hiveType.toString());
  }
  assertEquals(expected, actual);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- HIVE-26658: verify that INT64 Parquet timestamps can be read back through
-- every numeric Hive type after an incompatible column type change.
set hive.parquet.write.int64.timestamp=true;
set hive.parquet.timestamp.time.unit=micros;
CREATE TABLE hive_26658_table (ts TIMESTAMP) STORED AS PARQUET;

INSERT INTO hive_26658_table VALUES ('2022-10-21 15:58:32');
INSERT INTO hive_26658_table VALUES ('1970-01-01 00:00:00.000009');

SELECT * FROM hive_26658_table;

-- Allow re-typing the timestamp column to numeric types so each conversion
-- path of the INT64 converter can be exercised below.
set metastore.disallow.incompatible.col.type.changes=false;
ALTER TABLE hive_26658_table CHANGE ts ts TINYINT;

SELECT * FROM hive_26658_table;

ALTER TABLE hive_26658_table CHANGE ts ts SMALLINT;

SELECT * FROM hive_26658_table;

ALTER TABLE hive_26658_table CHANGE ts ts INT;

SELECT * FROM hive_26658_table;

ALTER TABLE hive_26658_table CHANGE ts ts BIGINT;

SELECT * FROM hive_26658_table;

ALTER TABLE hive_26658_table CHANGE ts ts DOUBLE;

SELECT * FROM hive_26658_table;

ALTER TABLE hive_26658_table CHANGE ts ts FLOAT;

SELECT * FROM hive_26658_table;

ALTER TABLE hive_26658_table CHANGE ts ts Decimal;

SELECT * FROM hive_26658_table;
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
PREHOOK: query: CREATE TABLE hive_26658_table (ts TIMESTAMP) STORED AS PARQUET
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: CREATE TABLE hive_26658_table (ts TIMESTAMP) STORED AS PARQUET
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: INSERT INTO hive_26658_table VALUES ('2022-10-21 15:58:32')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: INSERT INTO hive_26658_table VALUES ('2022-10-21 15:58:32')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@hive_26658_table
POSTHOOK: Lineage: hive_26658_table.ts SCRIPT []
PREHOOK: query: INSERT INTO hive_26658_table VALUES ('1970-01-01 00:00:00.000009')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: INSERT INTO hive_26658_table VALUES ('1970-01-01 00:00:00.000009')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@hive_26658_table
POSTHOOK: Lineage: hive_26658_table.ts SCRIPT []
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
2022-10-21 15:58:32
1970-01-01 00:00:00.000009
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts TINYINT
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts TINYINT
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
NULL
9
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts SMALLINT
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts SMALLINT
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
NULL
9
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts INT
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts INT
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
NULL
9
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts BIGINT
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts BIGINT
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
1666367912000000
9
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts DOUBLE
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts DOUBLE
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
1.666367912E15
9.0
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts FLOAT
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts FLOAT
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
1.66636785E15
9.0
PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts Decimal
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@hive_26658_table
PREHOOK: Output: default@hive_26658_table
POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts Decimal
POSTHOOK: type: ALTERTABLE_RENAMECOL
POSTHOOK: Input: default@hive_26658_table
POSTHOOK: Output: default@hive_26658_table
PREHOOK: query: SELECT * FROM hive_26658_table
PREHOOK: type: QUERY
PREHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM hive_26658_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@hive_26658_table
#### A masked pattern was here ####
NULL
9

0 comments on commit d18d88c

Please sign in to comment.