Skip to content

Commit

Permalink
Procedure for dropping statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
aalbu authored and findepi committed Jan 20, 2020
1 parent b32d98d commit bfef189
Show file tree
Hide file tree
Showing 4 changed files with 503 additions and 1 deletion.
19 changes: 19 additions & 0 deletions presto-docs/src/main/sphinx/connector/hive.rst
Expand Up @@ -630,6 +630,11 @@ specify a subset of columns to be analyzed via the optional ``columns`` property
This query collects statistics for columns ``col_1`` and ``col_2`` for the partition
with keys ``p2_value1, p2_value2``.

Note that if statistics were previously collected for all columns, they need to be dropped
before re-analyzing just a subset::

CALL system.drop_stats(schema_name, table_name, ARRAY[ARRAY['p2_value1', 'p2_value2']])

Schema Evolution
----------------

Expand Down Expand Up @@ -722,6 +727,13 @@ Procedures
* ``DROP``: drop any partitions that exist in the metastore, but not on the file system.
* ``FULL``: perform both ``ADD`` and ``DROP``.

* ``system.drop_stats(schema_name, table_name, partition_values)``

Drops statistics for a subset of partitions or the entire table. The partitions are specified as an
array whose elements are arrays of partition values (similar to the ``partition_values`` argument in
``create_empty_partition``). A null value for the ``partition_values`` argument indicates that stats
should be dropped for the entire table.

Examples
--------

Expand Down Expand Up @@ -768,6 +780,13 @@ Add an empty partition to the ``page_views`` table::
partition_columns => ARRAY['ds', 'country'],
partition_values => ARRAY['2016-08-09', 'US']);

Drop stats for a partition of the ``page_views`` table::

CALL system.drop_stats(
schema_name => 'web',
table_name => 'page_views',
partition_values => ARRAY['2016-08-09', 'US']);

Query the ``page_views`` table::

SELECT * FROM hive.web.page_views
Expand Down
@@ -0,0 +1,174 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.prestosql.plugin.hive;

import com.google.common.collect.ImmutableList;
import io.prestosql.plugin.hive.authentication.HiveIdentity;
import io.prestosql.plugin.hive.metastore.HiveMetastore;
import io.prestosql.spi.PrestoException;
import io.prestosql.spi.classloader.ThreadContextClassLoader;
import io.prestosql.spi.connector.ColumnHandle;
import io.prestosql.spi.connector.ConnectorSession;
import io.prestosql.spi.connector.SchemaTableName;
import io.prestosql.spi.procedure.Procedure;
import io.prestosql.spi.procedure.Procedure.Argument;
import io.prestosql.spi.type.ArrayType;

import javax.inject.Inject;
import javax.inject.Provider;

import java.lang.invoke.MethodHandle;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.prestosql.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT;
import static io.prestosql.spi.block.MethodHandleUtil.methodHandle;
import static io.prestosql.spi.type.VarcharType.VARCHAR;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static org.apache.hadoop.hive.metastore.utils.FileUtils.makePartName;

/**
* A procedure that drops statistics. It can be invoked for a subset of partitions (e.g.
* {@code CALL system.drop_stats('system', 'some_table', ARRAY[ARRAY['x', '7']])}) or
* for the entire table ({@code CALL system.drop_stats('system', 'some_table', NULL)})).
*/
public class DropStatsProcedure
implements Provider<Procedure>
{
private static final MethodHandle DROP_STATS = methodHandle(
DropStatsProcedure.class,
"dropStats",
ConnectorSession.class,
String.class,
String.class,
List.class);

private final Supplier<TransactionalMetadata> hiveMetadataFactory;
private final HiveMetastore metastore;

@Inject
public DropStatsProcedure(Supplier<TransactionalMetadata> hiveMetadataFactory, HiveMetastore metastore)
{
this.hiveMetadataFactory = requireNonNull(hiveMetadataFactory, "hiveMetadataFactory is null");
this.metastore = requireNonNull(metastore, "metastore is null");
}

@Override
public Procedure get()
{
return new Procedure(
"system",
"drop_stats",
ImmutableList.of(
new Argument("schema_name", VARCHAR),
new Argument("table_name", VARCHAR),
new Argument("partition_values", new ArrayType(new ArrayType(VARCHAR)))),
DROP_STATS.bindTo(this));
}

public void dropStats(ConnectorSession session, String schema, String table, List<?> partitionValues)
{
try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
doDropStats(session, schema, table, partitionValues);
}
}

private void doDropStats(ConnectorSession session, String schema, String table, List<?> partitionValues)
{
TransactionalMetadata hiveMetadata = hiveMetadataFactory.get();
HiveTableHandle handle = (HiveTableHandle) hiveMetadata.getTableHandle(session, new SchemaTableName(schema, table));
Map<String, ColumnHandle> columns = hiveMetadata.getColumnHandles(session, handle);
List<String> partitionColumns = columns.values().stream()
.map(HiveColumnHandle.class::cast)
.filter(HiveColumnHandle::isPartitionKey)
.map(HiveColumnHandle::getName)
.collect(toImmutableList());

if (partitionValues != null) {
// drop stats for specified partitions
List<List<String>> partitionStringValues = partitionValues.stream()
.map(DropStatsProcedure::validateParameterType)
.collect(toImmutableList());
validatePartitions(partitionStringValues, partitionColumns);

partitionStringValues.forEach(values -> metastore.updatePartitionStatistics(
new HiveIdentity(session.getIdentity()),
schema,
table,
makePartName(partitionColumns, values),
stats -> PartitionStatistics.empty()));
}
else {
// no partition specified, so drop stats for the entire table
if (partitionColumns.isEmpty()) {
// for non-partitioned tables, just wipe table stats
metastore.updateTableStatistics(
new HiveIdentity(session.getIdentity()),
schema,
table,
stats -> PartitionStatistics.empty());
}
else {
// the table is partitioned; remove stats for every partition
metastore.getPartitionNames(new HiveIdentity(session.getIdentity()), handle.getSchemaName(), handle.getTableName())
.ifPresent(partitions -> partitions.forEach(partitionName -> metastore.updatePartitionStatistics(
new HiveIdentity(session.getIdentity()),
schema,
table,
partitionName,
stats -> PartitionStatistics.empty())));
}
}

hiveMetadata.commit();
}

private static List<String> validateParameterType(Object param)
{
if (param == null) {
throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Null partition value");
}

if (param instanceof List) {
return ((List<?>) param)
.stream()
.map(String.class::cast)
.collect(toImmutableList());
}

throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Partition value must be an array");
}

private static void validatePartitions(List<List<String>> partitionValues, List<String> partitionColumns)
{
if (partitionValues.isEmpty()) {
throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "No partitions provided");
}

if (partitionColumns.isEmpty()) {
throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Cannot specify partition values for an unpartitioned table");
}

partitionValues.forEach(value -> {
if (value.size() != partitionColumns.size()) {
throw new PrestoException(
INVALID_PROCEDURE_ARGUMENT,
format("Partition values %s don't match the number of partition columns (%s)", value, partitionColumns.size()));
}
});
}
}
Expand Up @@ -30,5 +30,6 @@ public void configure(Binder binder)
Multibinder<Procedure> procedures = newSetBinder(binder, Procedure.class);
procedures.addBinding().toProvider(CreateEmptyPartitionProcedure.class).in(Scopes.SINGLETON);
procedures.addBinding().toProvider(SyncPartitionMetadataProcedure.class).in(Scopes.SINGLETON);
procedures.addBinding().toProvider(DropStatsProcedure.class).in(Scopes.SINGLETON);
}
}

0 comments on commit bfef189

Please sign in to comment.