From 5f4c861179a2079fc46371ed8273398cb3a2675c Mon Sep 17 00:00:00 2001 From: Sourabh Badhya Date: Wed, 12 Jul 2023 13:52:02 +0530 Subject: [PATCH] HIVE-27455: Iceberg: Set COLUMN_STATS_ACCURATE after writing stats for Iceberg tables (Sourabh Badhya, reviewed by Denys Kuzmenko, Krisztian Kasa) Closes #4440 --- .../mr/hive/HiveIcebergStorageHandler.java | 3 ++- .../mr/hive/TestHiveIcebergStatistics.java | 22 +++++++++++++++++ .../positive/truncate_iceberg_table.q.out | 2 +- .../hadoop/hive/ql/stats/BasicStatsTask.java | 9 +++++-- .../hive/ql/stats/ColStatsProcessor.java | 24 +++++++++++++++++-- 5 files changed, 54 insertions(+), 6 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index 30a7ca82a8d7..a8d5741ebd4f 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -470,10 +470,11 @@ public boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTabl PuffinCompressionCodec.NONE, ImmutableMap.of())); writer.finish(); + return true; } catch (IOException e) { LOG.error(String.valueOf(e)); + return false; } - return false; } @Override diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java index da3638eb1115..4ed42dac9fde 100644 --- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java +++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStatistics.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types.NestedField; import org.apache.thrift.TException; import org.junit.Assert; import org.junit.Assume; @@ -207,6 +208,27 @@ public void testStatsRemoved() throws IOException { checkColStat(identifier.name(), "customer_id", false); } + @Test + public void testColumnStatsAccurate() throws Exception { + TableIdentifier identifier = TableIdentifier.of("default", "customers"); + + shell.setHiveSessionValue(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, true); + testTables.createTable(shell, identifier.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + PartitionSpec.unpartitioned(), fileFormat, ImmutableList.of()); + + String insert = testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, identifier, true); + shell.executeStatement(insert); + + org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable("default", identifier.name()); + + // Assert whether basic stats and column stats are accurate. + Assert.assertTrue(hmsTable.getParameters().containsKey(StatsSetupConst.COLUMN_STATS_ACCURATE)); + Assert.assertTrue(StatsSetupConst.areBasicStatsUptoDate(hmsTable.getParameters())); + for (NestedField nestedField : HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns()) { + Assert.assertTrue(StatsSetupConst.areColumnStatsUptoDate(hmsTable.getParameters(), nestedField.name())); + } + } + private void checkColStat(String tableName, String colName, boolean accurate) { List rows = shell.executeStatement("DESCRIBE " + tableName + " " + colName); diff --git a/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out b/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out index 07e2a34e423f..dfab2edec758 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/truncate_iceberg_table.q.out @@ -225,7 +225,7 @@ Retention: 0 #### A masked pattern was here #### Table Type: EXTERNAL_TABLE Table Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"id\":\"true\",\"value\":\"true\"}} EXTERNAL TRUE bucketing_version 2 current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"value\",\"required\":false,\"type\":\"string\"}]} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java index 965d107fdde1..ed968edce0cf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsTask.java @@ -105,7 +105,7 @@ public int process(Hive db, Table tbl) throws Exception { LOG.info("Executing stats task"); table = tbl; - return aggregateStats(db); + return aggregateStats(db, tbl); } @Override @@ -264,7 +264,7 @@ public void process(StatsAggregator statsAggregator) throws HiveException, MetaE } } - private int aggregateStats(Hive db) { + private int aggregateStats(Hive db, Table tbl) { StatsAggregator statsAggregator = null; int ret = 0; @@ -314,6 +314,11 @@ private int aggregateStats(Hive db) { } LOG.info("Table " + tableFullName + " stats: [" + toString(p.getPartParameters()) + ']'); + // The table object is assigned to the latest table object. + // So that it can be used by ColStatsProcessor. + // This is only required for unpartitioned tables. + tbl.setTTable(res.getTTable()); + } else { // Partitioned table: // Need to get the old stats of the partition diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java index e2ee8ae07b41..8bdf6647993e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java @@ -25,7 +25,9 @@ import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.collections.CollectionUtils; import org.apache.hadoop.hive.common.HiveStatsUtils; +import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.common.ValidWriteIdList; import org.apache.hadoop.hive.conf.Constants; import org.apache.hadoop.hive.conf.HiveConf; @@ -34,6 +36,7 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.EnvironmentContext; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest; @@ -60,7 +63,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; - public class ColStatsProcessor implements IStatsProcessor { private static transient final Logger LOG = LoggerFactory.getLogger(ColStatsProcessor.class); @@ -219,8 +221,12 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce start = System. currentTimeMillis(); if (tbl != null && tbl.isNonNative() && tbl.getStorageHandler().canSetColStatistics(tbl)) { - tbl.getStorageHandler().setColStatistics(tbl, colStats); + boolean success = tbl.getStorageHandler().setColStatistics(tbl, colStats); + if (!(tbl.isMaterializedView() || tbl.isView() || tbl.isTemporary())) { + setOrRemoveColumnStatsAccurateProperty(db, tbl, colStatDesc.getColName(), success); + } } + // TODO: Write stats for native tables only (See HIVE-27421) db.setPartitionColumnStatistics(request); end = System.currentTimeMillis(); LOG.info("Time taken to update " + colStats.size() + " stats : " + ((end - start)/1000F) + " seconds."); @@ -232,6 +238,20 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce public void setDpPartSpecs(Collection dpPartSpecs) { } + private void setOrRemoveColumnStatsAccurateProperty(Hive db, Table tbl, List colNames, boolean success) throws HiveException { + if (CollectionUtils.isEmpty(colNames) || !colStatDesc.isTblLevel()) { + return; + } + EnvironmentContext environmentContext = new EnvironmentContext(); + environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE); + if (success) { + StatsSetupConst.setColumnStatsState(tbl.getParameters(), colNames); + } else { + StatsSetupConst.removeColumnStatsState(tbl.getParameters(), colNames); + } + db.alterTable(tbl.getFullyQualifiedName(), tbl, environmentContext, false); + } + /** * Enumeration of column stats fields that can currently * be computed. Each one has a field name associated.