Skip to content

HIVE-28910: Remove redundant IS NOT NULL predicates when expanding SEARCH #5795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 27, 2025
Merged
Original file line number Diff line number Diff line change
@@ -164,13 +164,13 @@ Stage-0
Select Operator [SEL_52] (rows=387 width=178)
Output:["_col0","_col1"]
Filter Operator [FIL_51] (rows=387 width=178)
predicate:(((key < '0') or ((key > '0') and (key < '100')) or (key > '100')) and key is not null and value is not null)
predicate:(((key < '0') or ((key > '0') and (key < '100')) or (key > '100')) and value is not null)
TableScan [TS_3] (rows=500 width=178)
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
<-Select Operator [SEL_55] (rows=387 width=269)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_54] (rows=387 width=269)
predicate:(((key1 < '0') or ((key1 > '0') and (key1 < '100')) or (key1 > '100')) and key1 is not null and key2 is not null)
predicate:(((key1 < '0') or ((key1 > '0') and (key1 < '100')) or (key1 > '100')) and key2 is not null)
TableScan [TS_0] (rows=500 width=269)
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:8,Grouping Partition Columns:["key1","key2"],Output:["key1","key2","value"]

@@ -360,13 +360,13 @@ Stage-0
Select Operator [SEL_32] (rows=387 width=178)
Output:["_col0","_col1"]
Filter Operator [FIL_31] (rows=387 width=178)
predicate:(((key < '0') or ((key > '0') and (key < '100')) or (key > '100')) and key is not null)
predicate:((key < '0') or (key > '100') or ((key > '0') and (key < '100')))
TableScan [TS_3] (rows=500 width=178)
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
<-Select Operator [SEL_35] (rows=387 width=269)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_34] (rows=387 width=269)
predicate:(((key1 < '0') or ((key1 > '0') and (key1 < '100')) or (key1 > '100')) and key1 is not null)
predicate:((key1 < '0') or (key1 > '100') or ((key1 > '0') and (key1 < '100')))
TableScan [TS_0] (rows=500 width=269)
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]

Original file line number Diff line number Diff line change
@@ -359,11 +359,11 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: o
filterExpr: ((((quantity > 0) and (quantity < 39)) or ((quantity > 39) and (quantity < 69)) or (quantity > 70)) and quantity is not null and itemid is not null) (type: boolean)
filterExpr: ((((quantity > 0) and (quantity < 39)) or ((quantity > 39) and (quantity < 69)) or (quantity > 70)) and itemid is not null) (type: boolean)
probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_29_container, bigKeyColName:itemid, smallTablePos:1, keyRatio:0.9523809523809523
Statistics: Num rows: 21 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((((quantity > 0) and (quantity < 39)) or ((quantity > 39) and (quantity < 69)) or (quantity > 70)) and quantity is not null and itemid is not null) (type: boolean)
predicate: ((((quantity > 0) and (quantity < 39)) or ((quantity > 39) and (quantity < 69)) or (quantity > 70)) and itemid is not null) (type: boolean)
Statistics: Num rows: 21 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: quantity (type: int), itemid (type: int)
Original file line number Diff line number Diff line change
@@ -58,14 +58,16 @@ public class SearchTransformer<C extends Comparable<C>> {
private final RexBuilder rexBuilder;
private final RexNode ref;
private final Sarg<C> sarg;
private final RexUnknownAs unknownContext;
protected final RelDataType type;

public SearchTransformer(RexBuilder rexBuilder, RexCall search) {
public SearchTransformer(RexBuilder rexBuilder, RexCall search, final RexUnknownAs unknownContext) {
this.rexBuilder = rexBuilder;
ref = search.getOperands().get(0);
RexLiteral literal = (RexLiteral) search.operands.get(1);
sarg = Objects.requireNonNull(literal.getValueAs(Sarg.class), "Sarg");
type = literal.getType();
this.unknownContext = unknownContext;
}

public RexNode transform() {
@@ -76,7 +78,7 @@ public RexNode transform() {
RangeSets.forEach(sarg.rangeSet, consumer);

List<RexNode> orList = new ArrayList<>();
if (sarg.nullAs == RexUnknownAs.TRUE) {
if (sarg.nullAs == RexUnknownAs.TRUE && unknownContext != RexUnknownAs.TRUE) {
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref));
}
switch (consumer.inLiterals.size()) {
@@ -94,7 +96,7 @@ public RexNode transform() {
orList.addAll(consumer.nodes);
RexNode x = RexUtil.composeDisjunction(rexBuilder, orList);

if (sarg.nullAs == RexUnknownAs.FALSE) {
if (sarg.nullAs == RexUnknownAs.FALSE && unknownContext != RexUnknownAs.FALSE) {
RexNode notNull = rexBuilder.makeCall(SqlStdOperatorTable.IS_NOT_NULL, ref);
x = RexUtil.composeConjunction(rexBuilder, Arrays.asList(notNull, x));
}
@@ -104,9 +106,11 @@ public RexNode transform() {

public static class Shuttle extends RexShuttle {
private final RexBuilder rexBuilder;
private RexUnknownAs unknownContext;

public Shuttle(final RexBuilder rexBuilder) {
public Shuttle(RexBuilder rexBuilder, RexUnknownAs unknownContext) {
this.rexBuilder = rexBuilder;
this.unknownContext = unknownContext;
}

@Override public RexNode visitCall(RexCall call) {
@@ -129,9 +133,21 @@ public Shuttle(final RexBuilder rexBuilder) {
return call;
}
case SEARCH:
return new SearchTransformer<>(rexBuilder, call).transform();
return new SearchTransformer<>(rexBuilder, call, this.unknownContext).transform();
default:
return super.visitCall(call);
// Some calls (e.g., IS [NOT] NULL, COALESCE) are sensitive to changes in the 3-valued
// logic (notably nulls). In such cases, the nullability of the operands is important,
// and it is unsafe to make simplifications based on the general unknown context.
// Instead of adding extra handlers for each special case we pick a more conservative
// approach and use the RexUnknownAs.UNKNOWN context for anything that is not a simple
// conjunction, disjunction, and SEARCH.
// Switch the unknown context, to preserve unknown/null semantics below this call.
RexUnknownAs previousContext = this.unknownContext;
this.unknownContext = RexUnknownAs.UNKNOWN;
RexNode newCall = super.visitCall(call);
// Restore the original context once we finish with the call operands.
this.unknownContext = previousContext;
return newCall;
}
}
}
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@
import org.apache.calcite.rex.RexCall;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexShuttle;
import org.apache.calcite.rex.RexUnknownAs;
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
@@ -152,7 +153,7 @@ private RexInBetweenExpander(RexBuilder rexBuilder) {
public RexNode visitCall(final RexCall call) {
switch (call.getKind()) {
case SEARCH: {
return new SearchTransformer<>(rexBuilder, call).transform().accept(this);
return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.UNKNOWN).transform().accept(this);
}
case AND: {
boolean[] update = {false};
Original file line number Diff line number Diff line change
@@ -23,6 +23,9 @@
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;

import static org.apache.calcite.rex.RexUnknownAs.FALSE;
import static org.apache.calcite.rex.RexUnknownAs.UNKNOWN;

/**
* A holder class for rules related to the SEARCH operator.
*/
@@ -32,17 +35,17 @@ private HiveSearchRules() {
}

public static final RelOptRule PROJECT_SEARCH_EXPAND =
new HiveRexShuttleTransformRule.Config().withRexShuttle(SearchTransformer.Shuttle::new)
new HiveRexShuttleTransformRule.Config().withRexShuttle(x -> new SearchTransformer.Shuttle(x, UNKNOWN))
.withDescription("HiveProjectSearchExpandRule")
.withOperandSupplier(o -> o.operand(HiveProject.class).anyInputs())
.toRule();
public static final RelOptRule FILTER_SEARCH_EXPAND =
new HiveRexShuttleTransformRule.Config().withRexShuttle(SearchTransformer.Shuttle::new)
new HiveRexShuttleTransformRule.Config().withRexShuttle(x -> new SearchTransformer.Shuttle(x, FALSE))
.withDescription("HiveFilterSearchExpandRule")
.withOperandSupplier(o -> o.operand(HiveFilter.class).anyInputs())
.toRule();
public static final RelOptRule JOIN_SEARCH_EXPAND =
new HiveRexShuttleTransformRule.Config().withRexShuttle(SearchTransformer.Shuttle::new)
new HiveRexShuttleTransformRule.Config().withRexShuttle(x -> new SearchTransformer.Shuttle(x, FALSE))
.withDescription("HiveJoinSearchExpandRule")
.withOperandSupplier(o -> o.operand(HiveJoin.class).anyInputs())
.toRule();
Original file line number Diff line number Diff line change
@@ -35,6 +35,7 @@
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.rex.RexLiteral;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexUnknownAs;
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.rex.RexVisitorImpl;
import org.apache.calcite.sql.SqlKind;
@@ -107,7 +108,7 @@ public Double visitCall(RexCall call) {
break;
}
case SEARCH:
return new SearchTransformer<>(rexBuilder, call).transform().accept(this);
return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.FALSE).transform().accept(this);
case OR: {
selectivity = computeDisjunctionSelectivity(call);
break;
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@
import org.apache.calcite.rex.RexLiteral;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexOver;
import org.apache.calcite.rex.RexUnknownAs;
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.rex.RexVisitorImpl;
import org.apache.calcite.rex.RexWindow;
@@ -200,7 +201,7 @@ public ExprNodeDesc visitCall(RexCall call) {
args.add(operand.accept(this));
}
} else if (call.getKind() == SqlKind.SEARCH) {
return new SearchTransformer<>(rexBuilder, call).transform().accept(this);
return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.UNKNOWN).transform().accept(this);
} else {
for (RexNode operand : call.operands) {
args.add(operand.accept(this));
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite;

import com.google.common.collect.ImmutableRangeSet;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;

import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexLiteral;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexUnknownAs;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.util.Sarg;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import static org.apache.calcite.rex.RexUnknownAs.FALSE;
import static org.apache.calcite.rex.RexUnknownAs.TRUE;
import static org.apache.calcite.rex.RexUnknownAs.UNKNOWN;
import static org.apache.calcite.rex.RexUnknownAs.values;
import static org.apache.calcite.sql.fun.SqlStdOperatorTable.COALESCE;
import static org.apache.calcite.sql.fun.SqlStdOperatorTable.SEARCH;
import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link SearchTransformer.Shuttle} verifying that redundant IS [NOT] NULL
 * predicates are dropped when a SEARCH call is expanded under a matching
 * {@link RexUnknownAs} context, and retained (or added) otherwise.
 *
 * <p>Each parameterized case supplies an input expression, the unknown-as context the
 * shuttle is created with, and the expected string rendering of the expanded expression.
 */
@RunWith(Parameterized.class)
public class TestSearchTransformerShuttle {

  /** Expression to be expanded by the shuttle. */
  private final RexNode node;
  /** Unknown-as context the shuttle is constructed with. */
  private final RexUnknownAs shuttleContext;
  /** Expected {@code toString()} form of the expanded expression. */
  private final String expected;

  public TestSearchTransformerShuttle(RexNode node, RexUnknownAs shuttleContext, String expected) {
    this.node = node;
    this.shuttleContext = shuttleContext;
    this.expected = expected;
  }

  @Test
  public void testAccept() {
    RexBuilder rexBuilder = new RexBuilder(new JavaTypeFactoryImpl(new HiveTypeSystemImpl()));
    SearchTransformer.Shuttle shuttle = new SearchTransformer.Shuttle(rexBuilder, shuttleContext);
    assertEquals(expected, node.accept(shuttle).toString());
  }

  /**
   * Builds the cross product of SEARCH expressions (with nullAs FALSE/TRUE/UNKNOWN) and
   * shuttle contexts, together with the expected expansion for each combination.
   */
  @Parameterized.Parameters(name = "expression={0}, shuttleCtx={1}")
  public static Collection<Object[]> generateExpressions() {
    RexBuilder rexBuilder = new RexBuilder(new JavaTypeFactoryImpl(new HiveTypeSystemImpl()));
    List<Object[]> expressions = new ArrayList<>();
    RexLiteral rexTrue = rexBuilder.makeLiteral(true);
    RexLiteral rexFalse = rexBuilder.makeLiteral(false);

    final RexNode searchUnknown = createSearchNode(rexBuilder, 10, 20, UNKNOWN);
    final RexNode searchFalse = createSearchNode(rexBuilder, 10, 20, FALSE);
    final RexNode searchTrue = createSearchNode(rexBuilder, 10, 20, TRUE);
    for (RexUnknownAs nullAs : values()) {
      // COALESCE is a special function so it's unsafe to drop IS [NOT] NULL, no matter the context
      expressions.add(new Object[] { rexBuilder.makeCall(COALESCE, searchFalse, rexTrue), nullAs,
          "COALESCE(AND(IS NOT NULL($0), BETWEEN(false, $0, 10, 20)), true)" });
      expressions.add(new Object[] { rexBuilder.makeCall(COALESCE, searchTrue, rexFalse), nullAs,
          "COALESCE(OR(IS NULL($0), BETWEEN(false, $0, 10, 20)), false)" });
      expressions.add(new Object[] { rexBuilder.makeCall(COALESCE, searchUnknown, rexFalse), nullAs,
          "COALESCE(BETWEEN(false, $0, 10, 20), false)" });
      // Search with null as unknown does not generate extra IS [NOT] NULL predicates
      expressions.add(new Object[] { searchUnknown, nullAs, "BETWEEN(false, $0, 10, 20)" });
    }

    // In FALSE/FALSE and TRUE/TRUE cases, the IS [NOT] NULL predicates are dropped
    expressions.add(new Object[] { searchFalse, FALSE, "BETWEEN(false, $0, 10, 20)" });
    expressions.add(new Object[] { searchTrue, TRUE, "BETWEEN(false, $0, 10, 20)" });

    expressions.add(new Object[] { searchTrue, FALSE, "OR(IS NULL($0), BETWEEN(false, $0, 10, 20))" });
    expressions.add(new Object[] { searchTrue, UNKNOWN, "OR(IS NULL($0), BETWEEN(false, $0, 10, 20))" });

    expressions.add(new Object[] { searchFalse, TRUE, "AND(IS NOT NULL($0), BETWEEN(false, $0, 10, 20))" });
    expressions.add(new Object[] { searchFalse, UNKNOWN, "AND(IS NOT NULL($0), BETWEEN(false, $0, 10, 20))" });
    return expressions;
  }

  /**
   * Creates a SEARCH call over a nullable INTEGER input ref with a single closed range
   * [lower, upper] and the given null-as semantics.
   */
  private static RexNode createSearchNode(RexBuilder builder, int lower, int upper, RexUnknownAs nullAs) {
    final RelDataType intType = builder.getTypeFactory()
        .createTypeWithNullability(builder.getTypeFactory().createSqlType(SqlTypeName.INTEGER), true);
    RangeSet<BigDecimal> rangeSet =
        ImmutableRangeSet.of(Range.closed(BigDecimal.valueOf(lower), BigDecimal.valueOf(upper)));
    // Parameterize the Sarg to avoid a raw-type unchecked warning; values are BigDecimal ranges.
    Sarg<BigDecimal> sarg = Sarg.of(nullAs, rangeSet);
    return builder.makeCall(SEARCH, builder.makeInputRef(intType, 0), builder.makeSearchArgumentLiteral(sarg, intType));
  }

}

Original file line number Diff line number Diff line change
@@ -95,10 +95,10 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: b
filterExpr: ((key) IN (0, 5) and key is not null) (type: boolean)
filterExpr: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 84 Data size: 7896 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((key) IN (0, 5) and key is not null) (type: boolean)
predicate: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 3 Data size: 282 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: int), value (type: string)
@@ -108,10 +108,10 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: a
filterExpr: ((key) IN (0, 5) and key is not null) (type: boolean)
filterExpr: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 10 Data size: 930 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((key) IN (0, 5) and key is not null) (type: boolean)
predicate: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: int), value (type: string)
@@ -323,10 +323,10 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: test_table2_n19
filterExpr: ((key) IN (0, 5) and key is not null) (type: boolean)
filterExpr: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 84 Data size: 7896 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((key) IN (0, 5) and key is not null) (type: boolean)
predicate: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 3 Data size: 282 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: int), value (type: string)
@@ -336,10 +336,10 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: test_table1_n20
filterExpr: ((key) IN (0, 5) and key is not null) (type: boolean)
filterExpr: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 10 Data size: 930 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: ((key) IN (0, 5) and key is not null) (type: boolean)
predicate: (key) IN (0, 5) (type: boolean)
Statistics: Num rows: 3 Data size: 279 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: int), value (type: string)
Original file line number Diff line number Diff line change
@@ -67,6 +67,6 @@ HiveProject(month=[$0], con_usd=[$2])
HiveFilter(condition=[=($0, 202110)])
HiveTableScan(table=[[default, test2]], table:alias=[test2])
HiveProject(mth=[$0], con_usd=[$1])
HiveFilter(condition=[AND(IS NOT NULL($0), IN($0, 202110, 202503))])
HiveFilter(condition=[IN($0, 202110, 202503)])
HiveTableScan(table=[[default, test3]], table:alias=[d])

Loading
Oops, something went wrong.
Loading
Oops, something went wrong.