ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.reflect.Field;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.TimestampTZ;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.llap.LlapDaemonInfo;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc.Mode;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyBinary;
import org.apache.hadoop.hive.serde2.lazy.LazyPrimitive;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyBinaryObjectInspector;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObject;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

import javolution.util.FastBitSet;

/**
 * GroupBy operator implementation.
 */
public class GroupByOperator extends Operator<GroupByDesc> implements IConfigureJobConf {

  private static final long serialVersionUID = 1L;
  private static final int NUMROWSESTIMATESIZE = 1000;

  private transient ExprNodeEvaluator[] keyFields;
  private transient ObjectInspector[] keyObjectInspectors;

  private transient ExprNodeEvaluator[][] aggregationParameterFields;
  private transient ObjectInspector[][] aggregationParameterObjectInspectors;
  private transient ObjectInspector[][] aggregationParameterStandardObjectInspectors;
  private transient Object[][] aggregationParameterObjects;

  // so aggregationIsDistinct is a boolean array instead of a single number.
  private transient boolean[] aggregationIsDistinct;
  // Map from integer tag to distinct aggrs
  private transient Map<Integer, Set<Integer>> distinctKeyAggrs = new HashMap<>();
  // Map from integer tag to non-distinct aggrs with key parameters.
  private transient Map<Integer, Set<Integer>> nonDistinctKeyAggrs = new HashMap<>();
  // List of non-distinct aggrs.
  private transient Set<Integer> nonDistinctAggrs = new HashSet<>();
  // Union expr for distinct keys
  private transient ExprNodeEvaluator unionExprEval;

  private transient GenericUDAFEvaluator[] aggregationEvaluators;
  private transient boolean[] estimableAggregationEvaluators;

  // Used by sort-based GroupBy: Mode = COMPLETE, PARTIAL1, PARTIAL2,
  // MERGEPARTIAL
  private transient KeyWrapper currentKeys;
  private transient KeyWrapper newKeys;
  private transient AggregationBuffer[] aggregations;
  private transient Object[][] aggregationsParametersLastInvoke;

  // Used by hash-based GroupBy: Mode = HASH, PARTIALS
  private transient HashMap<KeyWrapper, AggregationBuffer[]> hashAggregations;

  private transient boolean firstRow;
  private transient boolean hashAggr;
  private transient long numRowsInput;
  private transient long numRowsHashTbl;
  private transient int groupbyMapAggrInterval;
  private transient long numRowsCompareHashAggr;
  private transient float minReductionHashAggr;

  private transient int outputKeyLength;

  // current Key ObjectInspectors are standard ObjectInspectors
  private transient ObjectInspector[] currentKeyObjectInspectors;

  private transient MemoryMXBean memoryMXBean;

  private transient boolean groupingSetsPresent;      // generates grouping set
  private transient int groupingSetsPosition;         // position of grouping set, generally the last of keys
  private transient List<Long> groupingSets;       // declared grouping set values
  private transient FastBitSet[] groupingSetsBitSet;  // bitsets acquired from grouping set values
  private transient LongWritable[] newKeysGroupingSets;

  // for these positions, some variable primitive type (String) is used, so size
  // cannot be estimated. sample it at runtime.
  private transient List<Integer> keyPositionsSize;

  // for these positions, some variable primitive type (String) is used for the
  // aggregation classes
  private transient List<Field>[] aggrPositions;

  private transient int fixedRowSize;

  private transient int totalVariableSize;
  private transient int numEntriesVarSize;

  private transient int countAfterReport;   // report or forward
  private transient int heartbeatInterval;

  private transient boolean isTez;
  private transient boolean isLlap;
  private transient int numExecutors;

  /**
   * Total amount of memory allowed for JVM heap.
   */
  protected transient long maxMemory;

  /**
   * Max memory usable by the hashtable before it should flush.
   */
  protected transient long maxHashTblMemory;

  /**
   * configure percent of memory threshold usable by QP.
   */
  protected transient float memoryThreshold;

  /**
   * Current number of entries in the hash table.
   */
  protected transient int numEntriesHashTable;

  /**
   * This method returns the big-endian representation of value.
   * @param value
   * @param length
   * @return
   */
  public static FastBitSet groupingSet2BitSet(long value, int length) {
    FastBitSet bits = new FastBitSet();
    for (int index = length - 1; index >= 0; index--) {
      if (value % 2 != 0) {
        bits.set(index);
      }
      value = value >>> 1;
    }
    return bits;
  }

  /** Kryo ctor. */
  protected GroupByOperator() {
    super();
  }

  public GroupByOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    numRowsInput = 0;
    numRowsHashTbl = 0;

    heartbeatInterval = HiveConf.getIntVar(hconf,
        HiveConf.ConfVars.HIVESENDHEARTBEAT);
    countAfterReport = 0;
    ObjectInspector rowInspector = inputObjInspectors[0];

    // init keyFields
    int numKeys = conf.getKeys().size();

    keyFields = new ExprNodeEvaluator[numKeys];
    keyObjectInspectors = new ObjectInspector[numKeys];
    currentKeyObjectInspectors = new ObjectInspector[numKeys];
    for (int i = 0; i < numKeys; i++) {
      keyFields[i] = ExprNodeEvaluatorFactory.get(conf.getKeys().get(i), hconf);
      keyObjectInspectors[i] = keyFields[i].initialize(rowInspector);
      currentKeyObjectInspectors[i] = ObjectInspectorUtils
        .getStandardObjectInspector(keyObjectInspectors[i],
        ObjectInspectorCopyOption.WRITABLE);
    }

    // Initialize the constants for the grouping sets, so that they can be re-used for
    // each row
    groupingSetsPresent = conf.isGroupingSetsPresent();
    if (groupingSetsPresent) {
      groupingSets = conf.getListGroupingSets();
      groupingSetsPosition = conf.getGroupingSetPosition();
      newKeysGroupingSets = new LongWritable[groupingSets.size()];
      groupingSetsBitSet = new FastBitSet[groupingSets.size()];

      int pos = 0;
      for (Long groupingSet: groupingSets) {
        // Create the mapping corresponding to the grouping set
        newKeysGroupingSets[pos] = new LongWritable(groupingSet);
        groupingSetsBitSet[pos] = groupingSet2BitSet(groupingSet, groupingSetsPosition);
        pos++;
      }
    }

    // initialize unionExpr for reduce-side
    // reduce KEY has union field as the last field if there are distinct
    // aggregates in group-by.
    List<? extends StructField> sfs =
      ((StructObjectInspector) rowInspector).getAllStructFieldRefs();
    if (sfs.size() > 0) {
      StructField keyField = sfs.get(0);
      if (keyField.getFieldName().toUpperCase().equals(
          Utilities.ReduceField.KEY.name())) {
        ObjectInspector keyObjInspector = keyField.getFieldObjectInspector();
        if (keyObjInspector instanceof StructObjectInspector) {
          List<? extends StructField> keysfs =
            ((StructObjectInspector) keyObjInspector).getAllStructFieldRefs();
          if (keysfs.size() > 0) {
            // the last field is the union field, if any
            StructField sf = keysfs.get(keysfs.size() - 1);
            if (sf.getFieldObjectInspector().getCategory().equals(
                ObjectInspector.Category.UNION)) {
              unionExprEval = ExprNodeEvaluatorFactory.get(
                new ExprNodeColumnDesc(TypeInfoUtils.getTypeInfoFromObjectInspector(
                sf.getFieldObjectInspector()),
                keyField.getFieldName() + "." + sf.getFieldName(), null,
                false), hconf);
              unionExprEval.initialize(rowInspector);
            }
          }
        }
      }
    }
    // init aggregationParameterFields
    List<AggregationDesc> aggrs = conf.getAggregators();
    aggregationParameterFields = new ExprNodeEvaluator[aggrs.size()][];
    aggregationParameterObjectInspectors = new ObjectInspector[aggrs.size()][];
    aggregationParameterStandardObjectInspectors = new ObjectInspector[aggrs.size()][];
    aggregationParameterObjects = new Object[aggrs.size()][];
    aggregationIsDistinct = new boolean[aggrs.size()];
    for (int i = 0; i < aggrs.size(); i++) {
      AggregationDesc aggr = aggrs.get(i);
      List<ExprNodeDesc> parameters = aggr.getParameters();
      aggregationParameterFields[i] = new ExprNodeEvaluator[parameters.size()];
      aggregationParameterObjectInspectors[i] = new ObjectInspector[parameters
          .size()];
      aggregationParameterStandardObjectInspectors[i] = new ObjectInspector[parameters
          .size()];
      aggregationParameterObjects[i] = new Object[parameters.size()];
      for (int j = 0; j < parameters.size(); j++) {
        aggregationParameterFields[i][j] = ExprNodeEvaluatorFactory
            .get(parameters.get(j), hconf);
        aggregationParameterObjectInspectors[i][j] = aggregationParameterFields[i][j]
            .initialize(rowInspector);
        if (unionExprEval != null) {
          String[] names = parameters.get(j).getExprString().split("\\.");
          // parameters of the form : KEY.colx:t.coly
          if (Utilities.ReduceField.KEY.name().equals(names[0]) && names.length > 2) {
            String name = names[names.length - 2];
            int tag = Integer.parseInt(name.split("\\:")[1]);
            if (aggr.getDistinct()) {
              // is distinct
              distinctKeyAggrs.computeIfAbsent(tag, t -> new HashSet<>()).add(i);
            } else {
              nonDistinctKeyAggrs.computeIfAbsent(tag, t -> new HashSet<>()).add(i);
            }
          } else {
            // will be KEY._COLx or VALUE._COLx
            nonDistinctAggrs.add(i);
          }
        } else {
          if (aggr.getDistinct()) {
            aggregationIsDistinct[i] = true;
          }
        }
        aggregationParameterStandardObjectInspectors[i][j] = ObjectInspectorUtils
            .getStandardObjectInspector(
            aggregationParameterObjectInspectors[i][j],
            ObjectInspectorCopyOption.WRITABLE);
        aggregationParameterObjects[i][j] = null;
      }
      if (parameters.size() == 0) {
        // for ex: count(*)
        nonDistinctAggrs.add(i);
      }
    }

    // init aggregationClasses
    aggregationEvaluators = new GenericUDAFEvaluator[conf.getAggregators()
        .size()];
    for (int i = 0; i < aggregationEvaluators.length; i++) {
      AggregationDesc agg = conf.getAggregators().get(i);
      aggregationEvaluators[i] = agg.getGenericUDAFEvaluator();
    }

    MapredContext context = MapredContext.get();
    if (context != null) {
      for (GenericUDAFEvaluator genericUDAFEvaluator : aggregationEvaluators) {
        context.setup(genericUDAFEvaluator);
      }
    }

    // grouping id should be pruned, which is the last of key columns
    // see ColumnPrunerGroupByProc
    outputKeyLength = conf.pruneGroupingSetId() ? keyFields.length - 1 : keyFields.length;

    // init objectInspectors
    ObjectInspector[] objectInspectors =
            new ObjectInspector[outputKeyLength + aggregationEvaluators.length];
    for (int i = 0; i < outputKeyLength; i++) {
      objectInspectors[i] = currentKeyObjectInspectors[i];
    }
    for (int i = 0; i < aggregationEvaluators.length; i++) {
      objectInspectors[outputKeyLength + i] = aggregationEvaluators[i].init(conf.getAggregators()
              .get(i).getMode(), aggregationParameterObjectInspectors[i]);
    }

    aggregationsParametersLastInvoke = new Object[conf.getAggregators().size()][];
    if ((conf.getMode() != GroupByDesc.Mode.HASH || conf.getBucketGroup()) &&
      (!groupingSetsPresent)) {
      aggregations = newAggregations();
      hashAggr = false;
    } else {
      hashAggregations = new HashMap<KeyWrapper, AggregationBuffer[]>(256);
      aggregations = newAggregations();
      hashAggr = true;
      keyPositionsSize = new ArrayList<Integer>();
      aggrPositions = new List[aggregations.length];
      groupbyMapAggrInterval = HiveConf.getIntVar(hconf,
          HiveConf.ConfVars.HIVEGROUPBYMAPINTERVAL);

      // compare every groupbyMapAggrInterval rows
      numRowsCompareHashAggr = groupbyMapAggrInterval;
      minReductionHashAggr = conf.getMinReductionHashAggr();
    }

    List<String> fieldNames = new ArrayList<String>(conf.getOutputColumnNames());

    outputObjInspector = ObjectInspectorFactory
        .getStandardStructObjectInspector(fieldNames, Arrays.asList(objectInspectors));

    KeyWrapperFactory keyWrapperFactory =
      new KeyWrapperFactory(keyFields, keyObjectInspectors, currentKeyObjectInspectors);

    newKeys = keyWrapperFactory.getKeyWrapper();
    isTez = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez");
    isLlap = LlapDaemonInfo.INSTANCE.isLlap();
    numExecutors = isLlap ? LlapDaemonInfo.INSTANCE.getNumExecutors() : 1;
    firstRow = true;
    // estimate the number of hash table entries based on the size of each
    // entry. Since the size of a entry
    // is not known, estimate that based on the number of entries
    if (hashAggr) {
      computeMaxEntriesHashAggr();
    }
    memoryMXBean = ManagementFactory.getMemoryMXBean();
    maxMemory = isTez ? getConf().getMaxMemoryAvailable() : memoryMXBean.getHeapMemoryUsage().getMax();
    memoryThreshold = this.getConf().getMemoryThreshold();
    LOG.info("isTez: {} isLlap: {} numExecutors: {} maxMemory: {}", isTez, isLlap, numExecutors, maxMemory);
  }

  /**
   * Estimate the number of entries in map-side hash table. The user can specify
   * the total amount of memory to be used by the map-side hash. By default, all
   * available memory is used. The size of each row is estimated, rather
   * crudely, and the number of entries are figure out based on that.
   *
   * @return number of entries that can fit in hash table - useful for map-side
   *         aggregation only
   **/
  private void computeMaxEntriesHashAggr() throws HiveException {
    float memoryPercentage = this.getConf().getGroupByMemoryUsage();
    if (isTez) {
      maxHashTblMemory = (long) (memoryPercentage * getConf().getMaxMemoryAvailable());
    } else {
      maxHashTblMemory = (long) (memoryPercentage * Runtime.getRuntime().maxMemory());
    }
    LOG.info("Max hash table memory: {} bytes", maxHashTblMemory);
    estimateRowSize();
  }

  public static final int javaObjectOverHead = 64;
  public static final int javaHashEntryOverHead = 64;
  public static final int javaSizePrimitiveType = 16;
  public static final int javaSizeUnknownType = 256;

  /**
   * The size of the element at position 'pos' is returned, if possible. If the
   * datatype is of variable length, STRING, a list of such key positions is
   * maintained, and the size for such positions is then actually calculated at
   * runtime.
   *
   * @param pos
   *          the position of the key
   * @return the size of this datatype
   **/
  private int getSize(int pos, PrimitiveCategory category) {
    switch (category) {
    case VOID:
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
      return javaSizePrimitiveType;
    case STRING:
      keyPositionsSize.add(Integer.valueOf(pos));
      return javaObjectOverHead;
    case BINARY:
      keyPositionsSize.add(Integer.valueOf(pos));
      return javaObjectOverHead;
    case TIMESTAMP:
    case TIMESTAMPLOCALTZ:
      return javaObjectOverHead + javaSizePrimitiveType;
    default:
      return javaSizeUnknownType;
    }
  }

  /**
   * The size of the element at position 'pos' is returned, if possible. If the
   * field is of variable length, STRING, a list of such field names for the
   * field position is maintained, and the size for such positions is then
   * actually calculated at runtime.
   *
   * @param pos
   *          the position of the key
   * @param c
   *          the type of the key
   * @param f
   *          the field to be added
   * @return the size of this datatype
   **/
  private int getSize(int pos, Class<?> c, Field f) {
    if (c.isPrimitive()
        || c.isInstance(Boolean.valueOf(true))
        || c.isInstance(Byte.valueOf((byte) 0))
        || c.isInstance(Short.valueOf((short) 0))
        || c.isInstance(Integer.valueOf(0))
        || c.isInstance(Long.valueOf(0))
        || c.isInstance(Float.valueOf(0)) || c.isInstance(Double.valueOf(0))) {
      return javaSizePrimitiveType;
    }

    if (c.isInstance(new Timestamp(0)) || c.isInstance(new TimestampTZ())) {
      return javaObjectOverHead + javaSizePrimitiveType;
    }

    if (c.isInstance(new String()) || c.isInstance(new ByteArrayRef())) {
      if (aggrPositions[pos] == null) {
        aggrPositions[pos] = new ArrayList<Field>();
      }
      aggrPositions[pos].add(f);
      return javaObjectOverHead;
    }

    return javaSizeUnknownType;
  }

  /**
   * @param pos
   *          position of the key
   * @param typeInfo
   *          type of the input
   * @return the size of this datatype
   **/
  private int getSize(int pos, TypeInfo typeInfo) {
    if (typeInfo instanceof PrimitiveTypeInfo) {
      return getSize(pos, ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory());
    }
    return javaSizeUnknownType;
  }

  /**
   * @return the size of each row
   **/
  private void estimateRowSize() throws HiveException {
    // estimate the size of each entry -
    // a datatype with unknown size (String/Struct etc. - is assumed to be 256
    // bytes for now).
    // 64 bytes is the overhead for a reference
    fixedRowSize = javaHashEntryOverHead;

    List<ExprNodeDesc> keys = conf.getKeys();

    // Go over all the keys and get the size of the fields of fixed length. Keep
    // track of the variable length keys
    for (int pos = 0; pos < keys.size(); pos++) {
      fixedRowSize += getSize(pos, keys.get(pos).getTypeInfo());
    }

    // Go over all the aggregation classes and and get the size of the fields of
    // fixed length. Keep track of the variable length
    // fields in these aggregation classes.
    estimableAggregationEvaluators = new boolean[aggregationEvaluators.length];
    for (int i = 0; i < aggregationEvaluators.length; i++) {

      fixedRowSize += javaObjectOverHead;
      AggregationBuffer agg = aggregationEvaluators[i].getNewAggregationBuffer();
      if (GenericUDAFEvaluator.isEstimable(agg)) {
        estimableAggregationEvaluators[i] = true;
        continue;
      }
      Field[] fArr = ObjectInspectorUtils.getDeclaredNonStaticFields(agg.getClass());
      for (Field f : fArr) {
        fixedRowSize += getSize(i, f.getType(), f);
      }
    }
  }

  protected AggregationBuffer[] newAggregations() throws HiveException {
    AggregationBuffer[] aggs = new AggregationBuffer[aggregationEvaluators.length];
    for (int i = 0; i < aggregationEvaluators.length; i++) {
      aggs[i] = aggregationEvaluators[i].getNewAggregationBuffer();
      // aggregationClasses[i].reset(aggs[i]);
    }
    return aggs;
  }

  protected void resetAggregations(AggregationBuffer[] aggs) throws HiveException {
    for (int i = 0; i < aggs.length; i++) {
      aggregationEvaluators[i].reset(aggs[i]);
    }
  }

  /*
   * Update aggregations. If the aggregation is for distinct, in case of hash
   * aggregation, the client tells us whether it is a new entry. For sort-based
   * aggregations, the last row is compared with the current one to figure out
   * whether it has changed. As a cleanup, the lastInvoke logic can be pushed in
   * the caller, and this function can be independent of that. The client should
   * always notify whether it is a different row or not.
   *
   * @param aggs the aggregations to be evaluated
   *
   * @param row the row being processed
   *
   * @param rowInspector the inspector for the row
   *
   * @param hashAggr whether hash aggregation is being performed or not
   *
   * @param newEntryForHashAggr only valid if it is a hash aggregation, whether
   * it is a new entry or not
   */
  protected void updateAggregations(AggregationBuffer[] aggs, Object row,
      ObjectInspector rowInspector, boolean hashAggr,
      boolean newEntryForHashAggr, Object[][] lastInvoke) throws HiveException {
    if (unionExprEval == null) {
      for (int ai = 0; ai < aggs.length; ai++) {
        // Calculate the parameters
        Object[] o = new Object[aggregationParameterFields[ai].length];
        for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) {
          o[pi] = aggregationParameterFields[ai][pi].evaluate(row);
        }

        // Update the aggregations.
        if (aggregationIsDistinct[ai]) {
          if (hashAggr) {
            if (newEntryForHashAggr) {
              aggregationEvaluators[ai].aggregate(aggs[ai], o);
            }
          } else {
            if (lastInvoke[ai] == null) {
              lastInvoke[ai] = new Object[o.length];
            }
            if (ObjectInspectorUtils.compare(o,
                aggregationParameterObjectInspectors[ai], lastInvoke[ai],
                aggregationParameterStandardObjectInspectors[ai]) != 0) {
              aggregationEvaluators[ai].aggregate(aggs[ai], o);
              for (int pi = 0; pi < o.length; pi++) {
                lastInvoke[ai][pi] = ObjectInspectorUtils.copyToStandardObject(
                    o[pi], aggregationParameterObjectInspectors[ai][pi],
                    ObjectInspectorCopyOption.WRITABLE);
              }
            }
          }
        } else {
          aggregationEvaluators[ai].aggregate(aggs[ai], o);
        }
      }
      return;
    }

    if (distinctKeyAggrs.size() > 0) {
      // evaluate union object
      UnionObject uo = (UnionObject) (unionExprEval.evaluate(row));
      int unionTag = uo.getTag();

      // update non-distinct key aggregations : "KEY._colx:t._coly"
      if (nonDistinctKeyAggrs.get(unionTag) != null) {
        for (int pos : nonDistinctKeyAggrs.get(unionTag)) {
          Object[] o = new Object[aggregationParameterFields[pos].length];
          for (int pi = 0; pi < aggregationParameterFields[pos].length; pi++) {
            o[pi] = aggregationParameterFields[pos][pi].evaluate(row);
          }
          aggregationEvaluators[pos].aggregate(aggs[pos], o);
        }
      }
      // there may be multi distinct clauses for one column
      // update them all.
      if (distinctKeyAggrs.get(unionTag) != null) {
        for (int i : distinctKeyAggrs.get(unionTag)) {
          Object[] o = new Object[aggregationParameterFields[i].length];
          for (int pi = 0; pi < aggregationParameterFields[i].length; pi++) {
            o[pi] = aggregationParameterFields[i][pi].evaluate(row);
          }

          if (hashAggr) {
            if (newEntryForHashAggr) {
              aggregationEvaluators[i].aggregate(aggs[i], o);
            }
          } else {
            if (lastInvoke[i] == null) {
              lastInvoke[i] = new Object[o.length];
            }
            if (ObjectInspectorUtils.compare(o,
                aggregationParameterObjectInspectors[i],
                lastInvoke[i],
                aggregationParameterStandardObjectInspectors[i]) != 0) {
              aggregationEvaluators[i].aggregate(aggs[i], o);
              for (int pi = 0; pi < o.length; pi++) {
                lastInvoke[i][pi] = ObjectInspectorUtils.copyToStandardObject(
                    o[pi], aggregationParameterObjectInspectors[i][pi],
                    ObjectInspectorCopyOption.WRITABLE);
              }
            }
          }
        }
      }

      // update non-distinct groupby key or value aggregations: 'KEY._COLx or VALUE._colx'
      // these aggregations should be updated only once.
      if (unionTag == 0) {
        for (int pos : nonDistinctAggrs) {
          Object[] o = new Object[aggregationParameterFields[pos].length];
          for (int pi = 0; pi < aggregationParameterFields[pos].length; pi++) {
            o[pi] = aggregationParameterFields[pos][pi].evaluate(row);
          }
          aggregationEvaluators[pos].aggregate(aggs[pos], o);
        }
      }
    } else {
      for (int ai = 0; ai < aggs.length; ai++) {
        // there is no distinct aggregation,
        // update all aggregations
        Object[] o = new Object[aggregationParameterFields[ai].length];
        for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) {
          o[pi] = aggregationParameterFields[ai][pi].evaluate(row);
        }
        aggregationEvaluators[ai].aggregate(aggs[ai], o);
      }
    }
  }

  private void processKey(Object row,
      ObjectInspector rowInspector) throws HiveException {
    if (hashAggr) {
      newKeys.setHashKey();
      processHashAggr(row, rowInspector, newKeys);
    } else {
      processAggr(row, rowInspector, newKeys);
    }

    if (countAfterReport != 0 && (countAfterReport % heartbeatInterval) == 0
      && (reporter != null)) {
      reporter.progress();
      countAfterReport = 0;
    }
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    firstRow = false;
    ObjectInspector rowInspector = inputObjInspectors[0];
    // Total number of input rows is needed for hash aggregation only
    if (hashAggr) {
      numRowsInput++;
      // if hash aggregation is not behaving properly, disable it
      if (numRowsInput == numRowsCompareHashAggr) {
        numRowsCompareHashAggr += groupbyMapAggrInterval;
        long numRowsProcessed = groupingSetsPresent ? numRowsInput * groupingSets.size() : numRowsInput;
        // map-side aggregation should reduce the entries by at-least half
        if (numRowsHashTbl > numRowsProcessed * minReductionHashAggr) {
          LOG.warn("Disable Hash Aggr: #hash table = " + numRowsHashTbl
              + " #numRowsInput = " + numRowsInput + " reduction = " + 1.0 * (numRowsHashTbl / numRowsProcessed)
              + " minReduction = " + minReductionHashAggr + " groupingSetsPresent = " + groupingSetsPresent
              + " numRowsProcessed = " + numRowsProcessed);
          flushHashTable(true);
          hashAggr = false;
        } else {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Hash Aggr Enabled: #hash table = " + numRowsHashTbl + " #numRowsInput = " + numRowsInput
                + " reduction = " + 1.0 * (numRowsHashTbl / numRowsProcessed) + " minReduction = "
                + minReductionHashAggr + " groupingSetsPresent = " + groupingSetsPresent + " numRowsProcessed = "
                + numRowsProcessed);
          }
        }
      }
    }

    try {
      countAfterReport++;
      newKeys.getNewKey(row, rowInspector);

      if (groupingSetsPresent) {
        Object[] newKeysArray = newKeys.getKeyArray();
        Object[] cloneNewKeysArray = new Object[newKeysArray.length];
        System.arraycopy(newKeysArray, 0, cloneNewKeysArray, 0, groupingSetsPosition);

        for (int groupingSetPos = 0; groupingSetPos < groupingSets.size(); groupingSetPos++) {
          Arrays.fill(newKeysArray, 0, groupingSetsPosition, null);

          FastBitSet bitset = groupingSetsBitSet[groupingSetPos];
          // Some keys need to be left to null corresponding to that grouping set.
          for (int keyPos = bitset.nextClearBit(0); keyPos < groupingSetsPosition;
                  keyPos = bitset.nextClearBit(keyPos+1)) {
            newKeysArray[keyPos] = cloneNewKeysArray[keyPos];
          }

          newKeysArray[groupingSetsPosition] = newKeysGroupingSets[groupingSetPos];
          processKey(row, rowInspector);
        }
      } else {
        processKey(row, rowInspector);
      }
    } catch (HiveException e) {
      throw e;
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  private void processHashAggr(Object row, ObjectInspector rowInspector,
      KeyWrapper newKeys) throws HiveException {
    // Prepare aggs for updating
    AggregationBuffer[] aggs = null;
    boolean newEntryForHashAggr = false;

    // hash-based aggregations
    aggs = hashAggregations.get(newKeys);
    if (aggs == null) {
      KeyWrapper newKeyProber = newKeys.copyKey();
      aggs = newAggregations();
      hashAggregations.put(newKeyProber, aggs);
      newEntryForHashAggr = true;
      numRowsHashTbl++; // new entry in the hash table
    }

    // Update the aggs
    updateAggregations(aggs, row, rowInspector, true, newEntryForHashAggr, null);

    // We can only flush after the updateAggregations is done, or the
    // potentially new entry "aggs"
    // can be flushed out of the hash table.

    // Based on user-specified parameters, check if the hash table needs to be
    // flushed.
    if (shouldBeFlushed(newKeys)) {
      flushHashTable(false);
    }
  }

  // Non-hash aggregation
  private void processAggr(Object row,
      ObjectInspector rowInspector,
      KeyWrapper newKeys) throws HiveException {
    // Prepare aggs for updating
    AggregationBuffer[] aggs = null;
    Object[][] lastInvoke = null;
    //boolean keysAreEqual = (currentKeys != null && newKeys != null)?
    //  newKeyStructEqualComparer.areEqual(currentKeys, newKeys) : false;

    boolean keysAreEqual = (currentKeys != null && newKeys != null)?
        newKeys.equals(currentKeys) : false;

    // Forward the current keys if needed for sort-based aggregation
    if (currentKeys != null && !keysAreEqual) {
      // This is to optimize queries of the form:
      // select count(distinct key) from T
      // where T is sorted and bucketized by key
      // Partial aggregation is performed on the mapper, and the
      // reducer gets 1 row (partial result) per mapper.
      if (!conf.isDontResetAggrsDistinct()) {
        forward(currentKeys.getKeyArray(), aggregations);
        countAfterReport = 0;
      }
    }

    // Need to update the keys?
    if (currentKeys == null || !keysAreEqual) {
      if (currentKeys == null) {
        currentKeys = newKeys.copyKey();
      } else {
        currentKeys.copyKey(newKeys);
      }

      // Reset the aggregations
      // For distincts optimization with sorting/bucketing, perform partial aggregation
      if (!conf.isDontResetAggrsDistinct()) {
        resetAggregations(aggregations);
      }

      // clear parameters in last-invoke
      Arrays.fill(aggregationsParametersLastInvoke, 0, aggregationsParametersLastInvoke.length, null);
    }

    aggs = aggregations;

    lastInvoke = aggregationsParametersLastInvoke;
    // Update the aggs

    updateAggregations(aggs, row, rowInspector, false, false, lastInvoke);
  }

  /**
   * Based on user-parameters, should the hash table be flushed.
   *
   * @param newKeys
   *          keys for the row under consideration
   **/
  private boolean shouldBeFlushed(KeyWrapper newKeys) {
    int numEntries = hashAggregations.size();
    long usedMemory;
    float rate;

    // The fixed size for the aggregation class is already known. Get the
    // variable portion of the size every NUMROWSESTIMATESIZE rows.
    if ((numEntriesHashTable == 0) || ((numEntries % NUMROWSESTIMATESIZE) == 0)) {
      //check how much memory left memory
      usedMemory = memoryMXBean.getHeapMemoryUsage().getUsed();
      // TODO: there is no easy and reliable way to compute the memory used by the executor threads and on-heap cache.
      // Assuming the used memory is equally divided among all executors.
      usedMemory = isLlap ? usedMemory / numExecutors : usedMemory;
      rate = (float) usedMemory / (float) maxMemory;
      if (rate > memoryThreshold) {
        return (!isTez || numEntriesHashTable != 0);
      }
      for (Integer pos : keyPositionsSize) {
        Object key = newKeys.getKeyArray()[pos.intValue()];
        // Ignore nulls
        if (key != null) {
          if (key instanceof LazyString) {
              totalVariableSize +=
                  ((LazyPrimitive<LazyStringObjectInspector, Text>) key).
                      getWritableObject().getLength();
          } else if (key instanceof String) {
            totalVariableSize += ((String) key).length();
          } else if (key instanceof Text) {
            totalVariableSize += ((Text) key).getLength();
          } else if (key instanceof LazyBinary) {
            totalVariableSize +=
                ((LazyPrimitive<LazyBinaryObjectInspector, BytesWritable>) key).
                    getWritableObject().getLength();
          } else if (key instanceof BytesWritable) {
            totalVariableSize += ((BytesWritable) key).getLength();
          } else if (key instanceof ByteArrayRef) {
            totalVariableSize += ((ByteArrayRef) key).getData().length;
          }
        }
      }

      AggregationBuffer[] aggs = hashAggregations.get(newKeys);
      for (int i = 0; i < aggs.length; i++) {
        AggregationBuffer agg = aggs[i];
        if (estimableAggregationEvaluators[i]) {
          totalVariableSize += ((GenericUDAFEvaluator.AbstractAggregationBuffer)agg).estimate();
          continue;
        }
        if (aggrPositions[i] != null) {
          totalVariableSize += estimateSize(agg, aggrPositions[i]);
        }
      }

      numEntriesVarSize++;

      // Update the number of entries that can fit in the hash table
      numEntriesHashTable =
          (int) (maxHashTblMemory / (fixedRowSize + (totalVariableSize / numEntriesVarSize)));
      LOG.trace("Hash Aggr: #hash table = {} #max in hash table = {}", numEntries, numEntriesHashTable);
    }

    // flush if necessary
    return (numEntries >= numEntriesHashTable);
  }

  private int estimateSize(AggregationBuffer agg, List<Field> fields) {
    int length = 0;
    for (Field f : fields) {
      try {
        Object o = f.get(agg);
        if (o instanceof String){
          length += ((String)o).length();
        }
        else if (o instanceof ByteArrayRef){
          length += ((ByteArrayRef)o).getData().length;
        }
      } catch (Exception e) {
        // continue.. null out the field?
      }
    }
    return length;
  }

  /**
   * Flush hash table. This method is used by hash-based aggregations
   * @param complete
   * @throws HiveException
   */
  private void flushHashTable(boolean complete) throws HiveException {
    countAfterReport = 0;

    // Currently, the algorithm flushes 10% of the entries - this can be
    // changed in the future

    if (complete) {
      for (Map.Entry<KeyWrapper, AggregationBuffer[]> entry : hashAggregations.entrySet()) {
        forward(entry.getKey().getKeyArray(), entry.getValue());
      }
      hashAggregations = null;
      LOG.info("Hash Table completed flushed");
      return;
    }

    int oldSize = hashAggregations.size();
    LOG.info("Hash Tbl flush: #hash table = {}", oldSize);

    Iterator<Map.Entry<KeyWrapper, AggregationBuffer[]>> iter = hashAggregations
        .entrySet().iterator();
    int numDel = 0;
    while (iter.hasNext()) {
      Map.Entry<KeyWrapper, AggregationBuffer[]> m = iter.next();
      forward(m.getKey().getKeyArray(), m.getValue());
      iter.remove();
      numDel++;
      if (numDel * 10 >= oldSize) {
        LOG.info("Hash Table flushed: new size = {}", hashAggregations.size());
        return;
      }
    }
  }

  transient Object[] forwardCache;

  /**
   * Forward a record of keys and aggregation results.
   *
   * @param keys
   *          The keys in the record
   * @throws HiveException
   */
  private void forward(Object[] keys, AggregationBuffer[] aggs) throws HiveException {
    if (forwardCache == null) {
      forwardCache = new Object[outputKeyLength + aggs.length];
    }

    System.arraycopy(keys, 0, forwardCache, 0, outputKeyLength);

    for (int i = 0; i < aggs.length; i++) {
      forwardCache[outputKeyLength + i] = aggregationEvaluators[i].evaluate(aggs[i]);
    }

    forward(forwardCache, outputObjInspector);
  }

  /**
   * Forward all aggregations to children. It is only used by DemuxOperator.
   * @throws HiveException
   */
  @Override
  public void flush() throws HiveException{
    try {
      if (hashAggregations != null) {
        LOG.info("Begin Hash Table flush: size = {}", hashAggregations.size());
        Iterator iter = hashAggregations.entrySet().iterator();
        while (iter.hasNext()) {
          Map.Entry<KeyWrapper, AggregationBuffer[]> m = (Map.Entry) iter
              .next();

          forward(m.getKey().getKeyArray(), m.getValue());
          iter.remove();
        }
        hashAggregations.clear();
      } else if (aggregations != null) {
        // sort-based aggregations
        if (currentKeys != null) {
          forward(currentKeys.getKeyArray(), aggregations);
        }
        currentKeys = null;
      } else {
        // The GroupByOperator is not initialized, which means there is no
        // data
        // (since we initialize the operators when we see the first record).
        // Just do nothing here.
      }
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  /**
   * We need to forward all the aggregations to children.
   *
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (!abort) {
      try {
        // If there is no grouping key and no row came to this operator
        if (firstRow && GroupByOperator.shouldEmitSummaryRow(conf)) {
          firstRow = false;

          Object[] keys = new Object[outputKeyLength];
          int pos = conf.getGroupingSetPosition();
          if (pos >= 0 && pos < outputKeyLength) {
            keys[pos] = new LongWritable((1L << pos) - 1);
          }
          forward(keys, aggregations);
        } else {
          flush();
        }
      } catch (Exception e) {
        throw new HiveException(e);
      }
    }
    hashAggregations = null;
    super.closeOp(abort);
  }

  // Group by contains the columns needed - no need to aggregate from children
  public List<String> genColLists(
      HashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtx) {
    List<String> colLists = new ArrayList<String>();
    List<ExprNodeDesc> keys = conf.getKeys();
    for (ExprNodeDesc key : keys) {
      colLists = Utilities.mergeUniqElems(colLists, key.getCols());
    }

    List<AggregationDesc> aggrs = conf.getAggregators();
    for (AggregationDesc aggr : aggrs) {
      List<ExprNodeDesc> params = aggr.getParameters();
      for (ExprNodeDesc param : params) {
        colLists = Utilities.mergeUniqElems(colLists, param.getCols());
      }
    }

    return colLists;
  }

  /**
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return GroupByOperator.getOperatorName();
  }

  static public String getOperatorName() {
    return "GBY";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.GROUPBY;
  }

  /**
   * we can push the limit above GBY (running in Reducer), since that will generate single row
   * for each group. This doesn't necessarily hold for GBY (running in Mappers),
   * so we don't push limit above it.
   */
  @Override
  public boolean acceptLimitPushdown() {
    return getConf().getMode() == GroupByDesc.Mode.MERGEPARTIAL ||
        getConf().getMode() == GroupByDesc.Mode.COMPLETE;
  }

  public static boolean shouldEmitSummaryRow(GroupByDesc desc) {
    // empty keyset is basically ()
    if (desc.getKeys().size() == 0) {
      return true;
    }

    if (desc.getMode() != Mode.HASH && desc.getMode() != Mode.COMPLETE && desc.getMode() != Mode.PARTIAL1) {
      return false;
    }

    int groupingSetPosition = desc.getGroupingSetPosition();
    List<Long> listGroupingSets = desc.getListGroupingSets();
    // groupingSets are known at map/reducer side; but have to do real processing
    // hence groupingSetsPresent is true only at map side
    if (groupingSetPosition >= 0 && listGroupingSets != null) {
      Long emptyGrouping = (1L << groupingSetPosition) - 1;
      if (listGroupingSets.contains(emptyGrouping)) {
        return true;
      }
    }
    return false;
  }

  @Override
  public void configureJobConf(JobConf job) {
    // only needed when grouping sets are present
    if (conf.getGroupingSetPosition() > 0 && shouldEmitSummaryRow(conf)) {
      job.setBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, true);
    }
  }

  @Override
  public void replaceTabAlias(String oldAlias, String newAlias) {
    super.replaceTabAlias(oldAlias, newAlias);
    ExprNodeDescUtils.replaceTabAlias(getConf().getKeys(), oldAlias, newAlias);
  }

}