Skip to content

Commit cb760fd

Browse files
author
Steinar H. Gunderson
committed
Bug #32169846: HYPERGRAPH: ASSERTION `M_OPENED_TABLE != NULLPTR' FAILED.
If we are sorting by row ID, make sure not to use streaming for any derived tables, as it cannot deliver old rows (by definition).

Change-Id: Ib4be21201e8eadaa45444b5a926fd684b72d7cac
1 parent 7592fec commit cb760fd

File tree

7 files changed

+111
-36
lines changed

7 files changed

+111
-36
lines changed

mysql-test/r/derived.result

+14
Original file line numberDiff line numberDiff line change
@@ -4595,3 +4595,17 @@ Warnings:
45954595
Warning 1292 Incorrect datetime value: '1'
45964596
SET sql_mode=DEFAULT;
45974597
DROP TABLE t1;
4598+
#
4599+
# Bug #32169846: HYPERGRAPH: ASSERTION `M_OPENED_TABLE != NULLPTR' FAILED.
4600+
#
4601+
CREATE TABLE t1 (a LONGTEXT);
4602+
INSERT INTO t1 VALUES ('');
4603+
CREATE TABLE t2 (b INTEGER);
4604+
INSERT INTO t2 VALUES (0);
4605+
SELECT 1 FROM t2, (
4606+
SELECT a, rand() FROM t1 GROUP BY a
4607+
) d1
4608+
GROUP BY b;
4609+
1
4610+
1
4611+
DROP TABLE t1, t2;

mysql-test/t/derived.test

+21
Original file line numberDiff line numberDiff line change
@@ -3280,3 +3280,24 @@ SELECT (
32803280
) FROM t1;
32813281
SET sql_mode=DEFAULT;
32823282
DROP TABLE t1;
3283+
3284+
--echo #
3285+
--echo # Bug #32169846: HYPERGRAPH: ASSERTION `M_OPENED_TABLE != NULLPTR' FAILED.
3286+
--echo #
3287+
3288+
CREATE TABLE t1 (a LONGTEXT);
3289+
INSERT INTO t1 VALUES ('');
3290+
CREATE TABLE t2 (b INTEGER);
3291+
INSERT INTO t2 VALUES (0);
3292+
3293+
# The sort for GROUP BY needs to be on row ID due to the long blob in t1.
3294+
# This is incompatible with streaming of d1, so we want to check that it
3295+
# uses materialization instead. The rand() is there to force rematerialization
3296+
# every time (otherwise, streaming is not considered). The inner GROUP BY
3297+
# is there to preclude merging.
3298+
SELECT 1 FROM t2, (
3299+
SELECT a, rand() FROM t1 GROUP BY a
3300+
) d1
3301+
GROUP BY b;
3302+
3303+
DROP TABLE t1, t2;

sql/filesort.cc

+36-25
Original file line numberDiff line numberDiff line change
@@ -2096,6 +2096,37 @@ uint sortlength(THD *thd, st_sort_field *sortorder, uint s_length) {
20962096
return total_length;
20972097
}
20982098

2099+
// Returns true if a filesort over this table would have to sort row IDs
// instead of carrying the column values along as addon fields.
//
// Keeping large blobs in addon fields could be very inefficient, but small
// blobs are fine (where "small" is fuzzy, and relative to the size of the
// sort buffer). Two kinds of blobs count as small:
//
//  - Those with an explicit small bound: tinyblob (255 bytes) and
//    blob (65535 bytes).
//  - Those that are merely _typically_ small, such as JSON and geometry
//    values. Since we never document that these are blobs under the hood,
//    it is not unreasonable to expect the user to size their sort buffers
//    for a few rows of them. (A user sorting multi-megabyte JSON rows
//    would usually have plenty of RAM anyway, as they would need it just
//    to hold and process the result set.)
//
// What remains is mediumblob and longblob. A user who declares a column
// with one of those types may reasonably expect sorting not to pull many
// such values into memory at once, so for them we fall back to sorting
// row IDs.
bool SortWillBeOnRowId(TABLE *table) {
  for (Field **field_ptr = table->field; *field_ptr != nullptr; ++field_ptr) {
    Field *field = *field_ptr;
    // Only fields actually read by this query matter.
    if (!bitmap_is_set(table->read_set, field->field_index())) {
      continue;
    }
    // 70000 > 65535, so plain blob (and anything smaller) passes,
    // while mediumblob/longblob trigger row-ID sorting.
    if (field->type() == MYSQL_TYPE_BLOB &&
        field->max_packed_col_length() > 70000u) {
      return true;
    }
  }
  return false;
}
2129+
20992130
/**
21002131
Get descriptors of fields appended to sorted fields and
21012132
calculate their total length.
@@ -2146,35 +2177,15 @@ Addon_fields *Filesort::get_addon_fields(
21462177
if (table->is_nullable()) {
21472178
null_fields++;
21482179
}
2180+
if (SortWillBeOnRowId(table)) {
2181+
DBUG_ASSERT(m_sort_param.addon_fields == nullptr);
2182+
*addon_fields_status = Addon_fields_status::row_contains_blob;
2183+
return nullptr;
2184+
}
21492185
for (Field **pfield = table->field; *pfield != nullptr; ++pfield) {
21502186
Field *field = *pfield;
21512187
if (!bitmap_is_set(table->read_set, field->field_index())) continue;
21522188

2153-
// Having large blobs in addon fields could be very inefficient,
2154-
// but small blobs are OK (where “small” is a bit fuzzy, and relative
2155-
// to the size of the sort buffer). There are two types of small blobs:
2156-
//
2157-
// - Those explicitly bounded to small lengths, namely tinyblob
2158-
// (255 bytes) and blob (65535 bytes).
2159-
// - Those that are _typically_ fairly small, which includes JSON and
2160-
// geometries. We don't actually declare anywhere that they are
2161-
// implemented using blobs under the hood, so it's not unreasonable to
2162-
// demand that the user have large enough sort buffers for a few rows.
2163-
// (If a user has multi-megabyte JSON rows and wishes to sort them,
2164-
// they would usually have a fair bit of RAM anyway, since they'd need
2165-
// that to hold the result set and process it in a reasonable fashion.)
2166-
//
2167-
// That leaves only mediumblob and longblob. If a user declares a field as
2168-
// one of those, it's reasonable for them to expect that sorting doesn't
2169-
// need to pull many of them up in memory, so we should stick to sorting
2170-
// row IDs.
2171-
if (field->type() == MYSQL_TYPE_BLOB &&
2172-
field->max_packed_col_length() > 70000u) {
2173-
DBUG_ASSERT(m_sort_param.addon_fields == nullptr);
2174-
*addon_fields_status = Addon_fields_status::row_contains_blob;
2175-
return nullptr;
2176-
}
2177-
21782189
const uint field_length = field->max_packed_col_length();
21792190
AddWithSaturate(field_length, &total_length);
21802191

sql/filesort.h

+4
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ template <bool Is_big_endian>
118118
void copy_integer(uchar *to, size_t to_length, const uchar *from,
119119
size_t from_length, bool is_unsigned);
120120

121+
// Returns whether a sort involving this table would necessarily be on row ID,
122+
// even if not forced by other means.
123+
bool SortWillBeOnRowId(TABLE *table);
124+
121125
static inline void copy_native_longlong(uchar *to, size_t to_length,
122126
longlong val, bool is_unsigned) {
123127
#ifdef WORDS_BIGENDIAN

sql/join_optimizer/join_optimizer.cc

+28-5
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,13 @@ constexpr double kMaterializeOneRowCost = 0.1;
112112
class CostingReceiver {
113113
public:
114114
CostingReceiver(
115-
THD *thd, const JoinHypergraph &graph,
115+
THD *thd, const JoinHypergraph &graph, bool need_rowid,
116116
uint64_t supported_access_path_types,
117117
secondary_engine_modify_access_path_cost_t secondary_engine_cost_hook,
118118
string *trace)
119119
: m_thd(thd),
120120
m_graph(graph),
121+
m_need_rowid(need_rowid),
121122
m_supported_access_path_types(supported_access_path_types),
122123
m_secondary_engine_cost_hook(secondary_engine_cost_hook),
123124
m_trace(trace) {
@@ -162,6 +163,14 @@ class CostingReceiver {
162163
/// The graph we are running over.
163164
const JoinHypergraph &m_graph;
164165

166+
/// Whether we will be needing row IDs from our tables, typically for
167+
/// a later sort. If this happens, derived tables cannot use streaming,
168+
/// but need an actual materialization, since filesort expects to be
169+
/// able to go back and ask for a given row. (This is different from
170+
/// when we need row IDs for weedout, which doesn't preclude streaming.
171+
/// The hypergraph optimizer does not use weedout.)
172+
bool m_need_rowid;
173+
165174
/// The supported access path types. Access paths of types not in
166175
/// this set should not be created. It is currently only used to
167176
/// limit which join types to use, so any bit that does not
@@ -588,9 +597,9 @@ bool CostingReceiver::ProposeTableScan(TABLE *table, int node_idx) {
588597
// Handled in clear_corr_something_something, not here
589598
rematerialize = false;
590599
}
591-
materialize_path =
592-
GetAccessPathForDerivedTable(m_thd, tl, table, rematerialize,
593-
/*invalidators=*/nullptr, path);
600+
materialize_path = GetAccessPathForDerivedTable(
601+
m_thd, tl, table, rematerialize,
602+
/*invalidators=*/nullptr, m_need_rowid, path);
594603
}
595604

596605
// TODO(sgunders): Take rematerialization cost into account,
@@ -1588,6 +1597,19 @@ AccessPath *FindBestQueryPlan(THD *thd, SELECT_LEX *select_lex, string *trace) {
15881597
}
15891598
}
15901599

1600+
// Figure out if any later sort will need row IDs.
1601+
bool need_rowid = false;
1602+
if (select_lex->is_explicitly_grouped() || select_lex->is_ordered() ||
1603+
join->select_distinct) {
1604+
for (TABLE_LIST *tl = select_lex->leaf_tables; tl != nullptr;
1605+
tl = tl->next_leaf) {
1606+
if (SortWillBeOnRowId(tl->table)) {
1607+
need_rowid = true;
1608+
break;
1609+
}
1610+
}
1611+
}
1612+
15911613
// Run the actual join optimizer algorithm. This creates an access path
15921614
// for the join as a whole (with lowest possible cost, and thus also
15931615
// hopefully optimal execution time), with all pushable predicates applied.
@@ -1599,7 +1621,8 @@ AccessPath *FindBestQueryPlan(THD *thd, SELECT_LEX *select_lex, string *trace) {
15991621
}
16001622
const secondary_engine_modify_access_path_cost_t secondary_engine_cost_hook =
16011623
SecondaryEngineCostHook(thd);
1602-
CostingReceiver receiver(thd, graph, SupportedAccessPathTypes(thd),
1624+
CostingReceiver receiver(thd, graph, need_rowid,
1625+
SupportedAccessPathTypes(thd),
16031626
secondary_engine_cost_hook, trace);
16041627
if (EnumerateAllConnectedPartitions(graph.graph, &receiver) &&
16051628
!thd->is_error()) {

sql/sql_executor.cc

+6-5
Original file line numberDiff line numberDiff line change
@@ -1450,14 +1450,15 @@ static bool IsTableScan(AccessPath *path) {
14501450

14511451
AccessPath *GetAccessPathForDerivedTable(THD *thd, QEP_TAB *qep_tab,
14521452
AccessPath *table_path) {
1453-
return GetAccessPathForDerivedTable(thd, qep_tab->table_ref, qep_tab->table(),
1454-
qep_tab->rematerialize,
1455-
qep_tab->invalidators, table_path);
1453+
return GetAccessPathForDerivedTable(
1454+
thd, qep_tab->table_ref, qep_tab->table(), qep_tab->rematerialize,
1455+
qep_tab->invalidators, /*need_rowid=*/false, table_path);
14561456
}
14571457

14581458
AccessPath *GetAccessPathForDerivedTable(
14591459
THD *thd, TABLE_LIST *table_ref, TABLE *table, bool rematerialize,
1460-
Mem_root_array<const AccessPath *> *invalidators, AccessPath *table_path) {
1460+
Mem_root_array<const AccessPath *> *invalidators, bool need_rowid,
1461+
AccessPath *table_path) {
14611462
SELECT_LEX_UNIT *unit = table_ref->derived_unit();
14621463
JOIN *subjoin = nullptr;
14631464
Temp_table_param *tmp_table_param;
@@ -1509,7 +1510,7 @@ AccessPath *GetAccessPathForDerivedTable(
15091510
/*send_records_override=*/nullptr);
15101511
}
15111512
} else if (table_ref->common_table_expr() == nullptr && rematerialize &&
1512-
IsTableScan(table_path)) {
1513+
IsTableScan(table_path) && !need_rowid) {
15131514
// We don't actually need the materialization for anything (we would
15141515
// just be reading the rows straight out from the table, never to be used
15151516
// again), so we can just stream records directly over to the next

sql/sql_executor.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,8 @@ AccessPath *GetAccessPathForDerivedTable(THD *thd, QEP_TAB *qep_tab,
566566
AccessPath *table_path);
567567
AccessPath *GetAccessPathForDerivedTable(
568568
THD *thd, TABLE_LIST *table_ref, TABLE *table, bool rematerialize,
569-
Mem_root_array<const AccessPath *> *invalidators, AccessPath *table_path);
569+
Mem_root_array<const AccessPath *> *invalidators, bool need_rowid,
570+
AccessPath *table_path);
570571

571572
void ConvertItemsToCopy(const mem_root_deque<Item *> &items, Field **fields,
572573
Temp_table_param *param);

0 commit comments

Comments
 (0)