forked from ClickHouse/ClickHouse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
QueryAnalysisPass.cpp
7417 lines (6145 loc) · 333 KB
/
QueryAnalysisPass.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <Analyzer/Passes/QueryAnalysisPass.h>
#include <Common/checkStackSize.h>
#include <Common/NamePrompter.h>
#include <Common/ProfileEvents.h>
#include <IO/WriteBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeFunction.h>
#include <DataTypes/DataTypeSet.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/getLeastSupertype.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnSet.h>
#include <Columns/ColumnConst.h>
#include <Functions/FunctionFactory.h>
#include <Functions/UserDefined/UserDefinedExecutableFunctionFactory.h>
#include <Functions/UserDefined/UserDefinedSQLFunctionFactory.h>
#include <Functions/grouping.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <TableFunctions/TableFunctionFactory.h>
#include <Formats/FormatFactory.h>
#include <Databases/IDatabase.h>
#include <Storages/IStorage.h>
#include <Storages/StorageSet.h>
#include <Storages/StorageJoin.h>
#include <Interpreters/misc.h>
#include <Interpreters/convertFieldToType.h>
#include <Interpreters/StorageID.h>
#include <Interpreters/SelectQueryOptions.h>
#include <Interpreters/Set.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExternalDictionariesLoader.h>
#include <Interpreters/InterpreterSelectQueryAnalyzer.h>
#include <Processors/Executors/PullingAsyncPipelineExecutor.h>
#include <Analyzer/createUniqueTableAliases.h>
#include <Analyzer/Utils.h>
#include <Analyzer/SetUtils.h>
#include <Analyzer/AggregationUtils.h>
#include <Analyzer/WindowFunctionsUtils.h>
#include <Analyzer/ValidationUtils.h>
#include <Analyzer/HashUtils.h>
#include <Analyzer/IdentifierNode.h>
#include <Analyzer/MatcherNode.h>
#include <Analyzer/ColumnTransformers.h>
#include <Analyzer/ConstantNode.h>
#include <Analyzer/ColumnNode.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/LambdaNode.h>
#include <Analyzer/SortNode.h>
#include <Analyzer/InterpolateNode.h>
#include <Analyzer/WindowNode.h>
#include <Analyzer/TableNode.h>
#include <Analyzer/TableFunctionNode.h>
#include <Analyzer/QueryNode.h>
#include <Analyzer/ArrayJoinNode.h>
#include <Analyzer/JoinNode.h>
#include <Analyzer/UnionNode.h>
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/QueryTreeBuilder.h>
#include <Analyzer/IQueryTreeNode.h>
#include <Analyzer/Identifier.h>
namespace ProfileEvents
{
extern const Event ScalarSubqueriesGlobalCacheHit;
extern const Event ScalarSubqueriesCacheMiss;
}
namespace DB
{
namespace ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
extern const int UNKNOWN_IDENTIFIER;
extern const int UNKNOWN_FUNCTION;
extern const int LOGICAL_ERROR;
extern const int CYCLIC_ALIASES;
extern const int INCORRECT_RESULT_OF_SCALAR_SUBQUERY;
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int MULTIPLE_EXPRESSIONS_FOR_ALIAS;
extern const int TYPE_MISMATCH;
extern const int AMBIGUOUS_IDENTIFIER;
extern const int INVALID_WITH_FILL_EXPRESSION;
extern const int INVALID_LIMIT_EXPRESSION;
extern const int EMPTY_LIST_OF_COLUMNS_QUERIED;
extern const int TOO_DEEP_SUBQUERIES;
extern const int UNKNOWN_AGGREGATE_FUNCTION;
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
extern const int ILLEGAL_FINAL;
extern const int SAMPLING_NOT_SUPPORTED;
extern const int NO_COMMON_TYPE;
extern const int NOT_IMPLEMENTED;
extern const int ALIAS_REQUIRED;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNKNOWN_TABLE;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
extern const int FUNCTION_CANNOT_HAVE_PARAMETERS;
extern const int SYNTAX_ERROR;
extern const int UNEXPECTED_EXPRESSION;
}
/** Query analyzer implementation overview. Please check documentation in QueryAnalysisPass.h first.
* And additional documentation for each method, where special cases are described in detail.
*
* Each node in query must be resolved. For each query tree node resolved state is specific.
*
* For constant node no resolve process exists, it is resolved during construction.
*
* For table node no resolve process exists, it is resolved during construction.
*
* For function node to be resolved parameters and arguments must be resolved, function node must be initialized with concrete aggregate or
* non aggregate function and with result type.
*
* For lambda node there can be 2 different cases.
* 1. Standalone: WITH (x -> x + 1) AS lambda SELECT lambda(1); Such lambdas are inlined in query tree during query analysis pass.
* 2. Function arguments: WITH (x -> x + 1) AS lambda SELECT arrayMap(lambda, [1, 2, 3]); For such lambda resolution must
* set concrete lambda arguments (initially they are identifier nodes) and resolve lambda expression body.
*
* For query node resolve process must resolve all its inner nodes.
*
* For matcher node resolve process must replace it with matched nodes.
*
* For identifier node resolve process must replace it with concrete non identifier node. This part is most complex because
* for identifier resolution scopes and identifier lookup context play important part.
*
* ClickHouse SQL support lexical scoping for identifier resolution. Scope can be defined by query node or by expression node.
* Expression nodes that can define scope are lambdas and table ALIAS columns.
*
* Identifier lookup context can be expression, function, table.
*
* Examples: WITH (x -> x + 1) as func SELECT func() FROM func; During function `func` resolution identifier lookup is performed
* in function context.
*
* If there are no information of identifier context rules are following:
* 1. Try to resolve identifier in expression context.
* 2. Try to resolve identifier in function context, if it is allowed. Example: SELECT func(arguments); Here func identifier cannot be resolved in function context
* because query projection does not support that.
* 3. Try to resolve identifier in table context, if it is allowed. Example: SELECT table; Here table identifier cannot be resolved in function context
* because query projection does not support that.
*
* TODO: This does not supported properly before, because matchers could not be resolved from aliases.
*
* Identifiers are resolved with following rules:
* Resolution starts with current scope.
* 1. Try to resolve identifier from expression scope arguments. Lambda expression arguments are greatest priority.
* 2. Try to resolve identifier from aliases.
* 3. Try to resolve identifier from join tree if scope is query, or if there are registered table columns in scope.
* Steps 2 and 3 can be changed using prefer_column_name_to_alias setting.
* 4. If it is table lookup, try to resolve identifier from CTE.
* If identifier could not be resolved in current scope, resolution must be continued in parent scopes.
* 5. Try to resolve identifier from parent scopes.
*
* Additional rules about aliases and scopes.
* 1. Parent scope cannot refer alias from child scope.
* 2. Child scope can refer to alias in parent scope.
*
* Example: SELECT arrayMap(x -> x + 1 AS a, [1,2,3]), a; Identifier a is unknown in parent scope.
* Example: SELECT a FROM (SELECT 1 as a); Here we do not refer to alias a from child query scope. But we query it projection result, similar to tables.
* Example: WITH 1 as a SELECT (SELECT a) as b; Here in child scope identifier a is resolved using alias from parent scope.
*
* Additional rules about identifier binding.
* Bind for identifier to entity means that identifier first part match some node during analysis.
* If other parts of identifier cannot be resolved in that node, exception must be thrown.
*
* Example:
* CREATE TABLE test_table (id UInt64, compound_value Tuple(value UInt64)) ENGINE=TinyLog;
* SELECT compound_value.value, 1 AS compound_value FROM test_table;
* Identifier first part compound_value bound to entity with alias compound_value, but nested identifier part cannot be resolved from entity,
* lookup should not be continued, and exception must be thrown because if lookup continues that way identifier can be resolved from join tree.
*
* TODO: This was not supported properly before analyzer because nested identifier could not be resolved from alias.
*
* More complex example:
* CREATE TABLE test_table (id UInt64, value UInt64) ENGINE=TinyLog;
* WITH cast(('Value'), 'Tuple (value UInt64') AS value SELECT (SELECT value FROM test_table);
* Identifier first part value bound to test_table column value, but nested identifier part cannot be resolved from it,
* lookup should not be continued, and exception must be thrown because if lookup continues identifier can be resolved from parent scope.
*
* TODO: Update exception messages
* TODO: Table identifiers with optional UUID.
* TODO: Lookup functions arrayReduce(sum, [1, 2, 3]);
* TODO: Support function identifier resolve from parent query scope, if lambda in parent scope does not capture any columns.
*/
namespace
{
/// Identifier lookup context
enum class IdentifierLookupContext : uint8_t
{
EXPRESSION = 0,
FUNCTION,
TABLE_EXPRESSION,
};
const char * toString(IdentifierLookupContext identifier_lookup_context)
{
switch (identifier_lookup_context)
{
case IdentifierLookupContext::EXPRESSION: return "EXPRESSION";
case IdentifierLookupContext::FUNCTION: return "FUNCTION";
case IdentifierLookupContext::TABLE_EXPRESSION: return "TABLE_EXPRESSION";
}
}
const char * toStringLowercase(IdentifierLookupContext identifier_lookup_context)
{
switch (identifier_lookup_context)
{
case IdentifierLookupContext::EXPRESSION: return "expression";
case IdentifierLookupContext::FUNCTION: return "function";
case IdentifierLookupContext::TABLE_EXPRESSION: return "table expression";
}
}
/** Structure that represent identifier lookup during query analysis.
* Lookup can be in query expression, function, table context.
*/
struct IdentifierLookup
{
Identifier identifier;
IdentifierLookupContext lookup_context;
bool isExpressionLookup() const
{
return lookup_context == IdentifierLookupContext::EXPRESSION;
}
bool isFunctionLookup() const
{
return lookup_context == IdentifierLookupContext::FUNCTION;
}
bool isTableExpressionLookup() const
{
return lookup_context == IdentifierLookupContext::TABLE_EXPRESSION;
}
String dump() const
{
return identifier.getFullName() + ' ' + toString(lookup_context);
}
};
inline bool operator==(const IdentifierLookup & lhs, const IdentifierLookup & rhs)
{
return lhs.identifier.getFullName() == rhs.identifier.getFullName() && lhs.lookup_context == rhs.lookup_context;
}
[[maybe_unused]] inline bool operator!=(const IdentifierLookup & lhs, const IdentifierLookup & rhs)
{
return !(lhs == rhs);
}
struct IdentifierLookupHash
{
size_t operator()(const IdentifierLookup & identifier_lookup) const
{
return std::hash<std::string>()(identifier_lookup.identifier.getFullName()) ^ static_cast<uint8_t>(identifier_lookup.lookup_context);
}
};
enum class IdentifierResolvePlace : UInt8
{
NONE = 0,
EXPRESSION_ARGUMENTS,
ALIASES,
JOIN_TREE,
/// Valid only for table lookup
CTE,
/// Valid only for table lookup
DATABASE_CATALOG
};
const char * toString(IdentifierResolvePlace resolved_identifier_place)
{
switch (resolved_identifier_place)
{
case IdentifierResolvePlace::NONE: return "NONE";
case IdentifierResolvePlace::EXPRESSION_ARGUMENTS: return "EXPRESSION_ARGUMENTS";
case IdentifierResolvePlace::ALIASES: return "ALIASES";
case IdentifierResolvePlace::JOIN_TREE: return "JOIN_TREE";
case IdentifierResolvePlace::CTE: return "CTE";
case IdentifierResolvePlace::DATABASE_CATALOG: return "DATABASE_CATALOG";
}
}
struct IdentifierResolveResult
{
IdentifierResolveResult() = default;
QueryTreeNodePtr resolved_identifier;
IdentifierResolvePlace resolve_place = IdentifierResolvePlace::NONE;
bool resolved_from_parent_scopes = false;
[[maybe_unused]] bool isResolved() const
{
return resolve_place != IdentifierResolvePlace::NONE;
}
[[maybe_unused]] bool isResolvedFromParentScopes() const
{
return resolved_from_parent_scopes;
}
[[maybe_unused]] bool isResolvedFromExpressionArguments() const
{
return resolve_place == IdentifierResolvePlace::EXPRESSION_ARGUMENTS;
}
[[maybe_unused]] bool isResolvedFromAliases() const
{
return resolve_place == IdentifierResolvePlace::ALIASES;
}
[[maybe_unused]] bool isResolvedFromJoinTree() const
{
return resolve_place == IdentifierResolvePlace::JOIN_TREE;
}
[[maybe_unused]] bool isResolvedFromCTEs() const
{
return resolve_place == IdentifierResolvePlace::CTE;
}
void dump(WriteBuffer & buffer) const
{
if (!resolved_identifier)
{
buffer << "unresolved";
return;
}
buffer << resolved_identifier->formatASTForErrorMessage() << " place " << toString(resolve_place) << " resolved from parent scopes " << resolved_from_parent_scopes;
}
[[maybe_unused]] String dump() const
{
WriteBufferFromOwnString buffer;
dump(buffer);
return buffer.str();
}
};
struct IdentifierResolveState
{
IdentifierResolveResult resolve_result;
bool cyclic_identifier_resolve = false;
};
struct IdentifierResolveSettings
{
/// Allow to check join tree during identifier resolution
bool allow_to_check_join_tree = true;
/// Allow to check CTEs during table identifier resolution
bool allow_to_check_cte = true;
/// Allow to check parent scopes during identifier resolution
bool allow_to_check_parent_scopes = true;
/// Allow to check database catalog during table identifier resolution
bool allow_to_check_database_catalog = true;
/// Allow to resolve subquery during identifier resolution
bool allow_to_resolve_subquery_during_identifier_resolution = true;
};
struct StringTransparentHash
{
using is_transparent = void;
using hash = std::hash<std::string_view>;
[[maybe_unused]] size_t operator()(const char * data) const
{
return hash()(data);
}
size_t operator()(std::string_view data) const
{
return hash()(data);
}
size_t operator()(const std::string & data) const
{
return hash()(data);
}
};
using ColumnNameToColumnNodeMap = std::unordered_map<std::string, ColumnNodePtr, StringTransparentHash, std::equal_to<>>;
struct TableExpressionData
{
std::string table_expression_name;
std::string table_expression_description;
std::string database_name;
std::string table_name;
bool should_qualify_columns = true;
NamesAndTypes column_names_and_types;
ColumnNameToColumnNodeMap column_name_to_column_node;
std::unordered_set<std::string, StringTransparentHash, std::equal_to<>> column_identifier_first_parts;
bool hasFullIdentifierName(IdentifierView identifier_view) const
{
return column_name_to_column_node.contains(identifier_view.getFullName());
}
bool canBindIdentifier(IdentifierView identifier_view) const
{
return column_identifier_first_parts.contains(identifier_view.at(0));
}
[[maybe_unused]] void dump(WriteBuffer & buffer) const
{
buffer << "Table expression name " << table_expression_name;
if (!table_expression_description.empty())
buffer << " table expression description " << table_expression_description;
if (!database_name.empty())
buffer << " database name " << database_name;
if (!table_name.empty())
buffer << " table name " << table_name;
buffer << " should qualify columns " << should_qualify_columns;
buffer << " columns size " << column_name_to_column_node.size() << '\n';
for (const auto & [column_name, column_node] : column_name_to_column_node)
buffer << "Column name " << column_name << " column node " << column_node->dumpTree() << '\n';
}
[[maybe_unused]] String dump() const
{
WriteBufferFromOwnString buffer;
dump(buffer);
return buffer.str();
}
};
class ExpressionsStack
{
public:
void pushNode(const QueryTreeNodePtr & node)
{
if (node->hasAlias())
{
const auto & node_alias = node->getAlias();
alias_name_to_expressions[node_alias].push_back(node);
}
if (const auto * function = node->as<FunctionNode>())
{
if (AggregateFunctionFactory::instance().isAggregateFunctionName(function->getFunctionName()))
++aggregate_functions_counter;
}
expressions.emplace_back(node);
}
void popNode()
{
const auto & top_expression = expressions.back();
const auto & top_expression_alias = top_expression->getAlias();
if (!top_expression_alias.empty())
{
auto it = alias_name_to_expressions.find(top_expression_alias);
auto & alias_expressions = it->second;
alias_expressions.pop_back();
if (alias_expressions.empty())
alias_name_to_expressions.erase(it);
}
if (const auto * function = top_expression->as<FunctionNode>())
{
if (AggregateFunctionFactory::instance().isAggregateFunctionName(function->getFunctionName()))
--aggregate_functions_counter;
}
expressions.pop_back();
}
[[maybe_unused]] const QueryTreeNodePtr & getRoot() const
{
return expressions.front();
}
const QueryTreeNodePtr & getTop() const
{
return expressions.back();
}
[[maybe_unused]] bool hasExpressionWithAlias(const std::string & alias) const
{
return alias_name_to_expressions.contains(alias);
}
bool hasAggregateFunction() const
{
return aggregate_functions_counter > 0;
}
QueryTreeNodePtr getExpressionWithAlias(const std::string & alias) const
{
auto expression_it = alias_name_to_expressions.find(alias);
if (expression_it == alias_name_to_expressions.end())
return {};
return expression_it->second.front();
}
[[maybe_unused]] size_t size() const
{
return expressions.size();
}
bool empty() const
{
return expressions.empty();
}
void dump(WriteBuffer & buffer) const
{
buffer << expressions.size() << '\n';
for (const auto & expression : expressions)
{
buffer << "Expression ";
buffer << expression->formatASTForErrorMessage();
const auto & alias = expression->getAlias();
if (!alias.empty())
buffer << " alias " << alias;
buffer << '\n';
}
}
[[maybe_unused]] String dump() const
{
WriteBufferFromOwnString buffer;
dump(buffer);
return buffer.str();
}
private:
QueryTreeNodes expressions;
size_t aggregate_functions_counter = 0;
std::unordered_map<std::string, QueryTreeNodes> alias_name_to_expressions;
};
/** Projection names is name of query tree node that is used in projection part of query node.
* Example: SELECT id FROM test_table;
* `id` is projection name of column node
*
* Example: SELECT id AS id_alias FROM test_table;
* `id_alias` is projection name of column node
*
* Calculation of projection names is done during expression nodes resolution. This is done this way
* because after identifier node is resolved we lose information about identifier name. We could
* potentially save this information in query tree node itself, but that would require to clone it in some cases.
* Example: SELECT big_scalar_subquery AS a, a AS b, b AS c;
* All 3 nodes in projection are the same big_scalar_subquery, but they have different projection names.
* If we want to save it in query tree node, we have to clone subquery node that could lead to performance degradation.
*
* Possible solution is to separate query node metadata and query node content. So only node metadata could be cloned
* if we want to change projection name. This solution does not seem to be easy for client of query tree because projection
* name will be part of interface. If we potentially could hide projection names calculation in analyzer without introducing additional
* changes in query tree structure that would be preferable.
*
* Currently each resolve method returns projection names array. Resolve method must compute projection names of node.
* If node is resolved as list node this is case for `untuple` function or `matcher` result projection names array must contain projection names
* for result nodes.
* If node is not resolved as list node, projection names array contain single projection name for node.
*
* Rules for projection names:
* 1. If node has alias. It is node projection name.
* Except scenario where `untuple` function has alias. Example: SELECT untuple(expr) AS alias, alias.
*
* 2. For constant it is constant value string representation.
*
* 3. For identifier:
* If identifier is resolved from JOIN TREE, we want to remove additional identifier qualifications.
* Example: SELECT default.test_table.id FROM test_table.
* Result projection name is `id`.
*
* Example: SELECT t1.id FROM test_table_1 AS t1, test_table_2 AS t2
* In example both test_table_1, test_table_2 have `id` column.
* In such case projection name is `t1.id` because if additional qualification is removed then column projection name `id` will be ambiguous.
*
* Example: SELECT default.test_table_1.id FROM test_table_1 AS t1, test_table_2 AS t2
* In such case projection name is `test_table_1.id` because we remove unnecessary database qualification, but table name qualification cannot be removed
* because otherwise column projection name `id` will be ambiguous.
*
* If identifier is not resolved from JOIN TREE. Identifier name is projection name.
* Except scenario where `untuple` function resolved using identifier. Example: SELECT untuple(expr) AS alias, alias.
* Example: SELECT sum(1, 1) AS value, value.
* In such case both nodes have `value` projection names.
*
* Example: SELECT id AS value, value FROM test_table.
* In such case both nodes have have `value` projection names.
*
* Special case is `untuple` function. If `untuple` function specified with alias, then result nodes will have alias.tuple_column_name projection names.
* Example: SELECT cast(tuple(1), 'Tuple(id UInt64)') AS value, untuple(value) AS a;
* Result projection names are `value`, `a.id`.
*
* If `untuple` function does not have alias then result nodes will have `tupleElement(untuple_expression_projection_name, 'tuple_column_name') projection names.
*
* Example: SELECT cast(tuple(1), 'Tuple(id UInt64)') AS value, untuple(value);
* Result projection names are `value`, `tupleElement(value, 'id')`;
*
* 4. For function:
* Projection name consists from function_name(parameters_projection_names)(arguments_projection_names).
* Additionally if function is window function. Window node projection name is used with OVER clause.
* Example: function_name (parameters_names)(argument_projection_names) OVER window_name;
* Example: function_name (parameters_names)(argument_projection_names) OVER (PARTITION BY id ORDER BY id).
* Example: function_name (parameters_names)(argument_projection_names) OVER (window_name ORDER BY id).
*
* 5. For lambda:
* If it is standalone lambda that returns single expression, function projection name is used.
* Example: WITH (x -> x + 1) AS lambda SELECT lambda(1).
* Projection name is `lambda(1)`.
*
* If is it standalone lambda that returns list, projection names of list nodes are used.
* Example: WITH (x -> *) AS lambda SELECT lambda(1) FROM test_table;
* If test_table has two columns `id`, `value`. Then result projection names are `id`, `value`.
*
* If lambda is argument of function.
* Then projection name consists from lambda(tuple(lambda_arguments)(lambda_body_projection_name));
*
* 6. For matcher:
* Matched nodes projection names are used as matcher projection names.
*
* Matched nodes must be qualified if needed.
* Example: SELECT * FROM test_table_1 AS t1, test_table_2 AS t2.
* In example table test_table_1 and test_table_2 both have `id`, `value` columns.
* Matched nodes after unqualified matcher resolve must be qualified to avoid ambiguous projection names.
* Result projection names must be `t1.id`, `t1.value`, `t2.id`, `t2.value`.
*
* There are special cases
* 1. For lambda inside APPLY matcher transformer:
* Example: SELECT * APPLY x -> toString(x) FROM test_table.
* In such case lambda argument projection name `x` will be replaced by matched node projection name.
* If table has two columns `id` and `value`. Then result projection names are `toString(id)`, `toString(value)`;
*
* 2. For unqualified matcher when JOIN tree contains JOIN with USING.
* Example: SELECT * FROM test_table_1 AS t1 INNER JOIN test_table_2 AS t2 USING(id);
* Result projection names must be `id`, `t1.value`, `t2.value`.
*
* 7. For subquery:
* For subquery projection name consists of `_subquery_` prefix and implementation specific unique number suffix.
* Example: SELECT (SELECT 1), (SELECT 1 UNION DISTINCT SELECT 1);
* Result projection name can be `_subquery_1`, `subquery_2`;
*
* 8. For table:
* Table node can be used in expression context only as right argument of IN function. In that case identifier is used
* as table node projection name.
* Example: SELECT id IN test_table FROM test_table;
* Result projection name is `in(id, test_table)`.
*/
using ProjectionName = String;
using ProjectionNames = std::vector<ProjectionName>;
constexpr auto PROJECTION_NAME_PLACEHOLDER = "__projection_name_placeholder";
struct IdentifierResolveScope
{
/// Construct identifier resolve scope using scope node, and parent scope
IdentifierResolveScope(QueryTreeNodePtr scope_node_, IdentifierResolveScope * parent_scope_)
: scope_node(std::move(scope_node_))
, parent_scope(parent_scope_)
{
if (parent_scope)
{
subquery_depth = parent_scope->subquery_depth;
context = parent_scope->context;
}
if (auto * union_node = scope_node->as<UnionNode>())
{
context = union_node->getContext();
}
else if (auto * query_node = scope_node->as<QueryNode>())
{
context = query_node->getContext();
group_by_use_nulls = context->getSettingsRef().group_by_use_nulls &&
(query_node->isGroupByWithGroupingSets() || query_node->isGroupByWithRollup() || query_node->isGroupByWithCube());
}
}
QueryTreeNodePtr scope_node;
IdentifierResolveScope * parent_scope = nullptr;
ContextPtr context;
/// Identifier lookup to result
std::unordered_map<IdentifierLookup, IdentifierResolveState, IdentifierLookupHash> identifier_lookup_to_resolve_state;
/// Lambda argument can be expression like constant, column, or it can be function
std::unordered_map<std::string, QueryTreeNodePtr> expression_argument_name_to_node;
/// Alias name to query expression node
std::unordered_map<std::string, QueryTreeNodePtr> alias_name_to_expression_node;
/// Alias name to lambda node
std::unordered_map<std::string, QueryTreeNodePtr> alias_name_to_lambda_node;
/// Alias name to table expression node
std::unordered_map<std::string, QueryTreeNodePtr> alias_name_to_table_expression_node;
/// Table column name to column node. Valid only during table ALIAS columns resolve.
ColumnNameToColumnNodeMap column_name_to_column_node;
/// CTE name to query node
std::unordered_map<std::string, QueryTreeNodePtr> cte_name_to_query_node;
/// Window name to window node
std::unordered_map<std::string, QueryTreeNodePtr> window_name_to_window_node;
/// Nodes with duplicated aliases
std::unordered_set<QueryTreeNodePtr> nodes_with_duplicated_aliases;
/// Current scope expression in resolve process stack
ExpressionsStack expressions_in_resolve_process_stack;
/// Table expressions in resolve process
std::unordered_set<const IQueryTreeNode *> table_expressions_in_resolve_process;
/// Current scope expression
std::unordered_set<IdentifierLookup, IdentifierLookupHash> non_cached_identifier_lookups_during_expression_resolve;
/// Table expression node to data
std::unordered_map<QueryTreeNodePtr, TableExpressionData> table_expression_node_to_data;
QueryTreeNodePtrWithHashSet nullable_group_by_keys;
/// Use identifier lookup to result cache
bool use_identifier_lookup_to_result_cache = true;
/// Apply nullability to aggregation keys
bool group_by_use_nulls = false;
/// JOINs count
size_t joins_count = 0;
/// Subquery depth
size_t subquery_depth = 0;
/** Scope join tree node for expression.
* Valid only during analysis construction for single expression.
*/
QueryTreeNodePtr expression_join_tree_node;
[[maybe_unused]] const IdentifierResolveScope * getNearestQueryScope() const
{
const IdentifierResolveScope * scope_to_check = this;
while (scope_to_check != nullptr)
{
if (scope_to_check->scope_node->getNodeType() == QueryTreeNodeType::QUERY)
break;
scope_to_check = scope_to_check->parent_scope;
}
return scope_to_check;
}
IdentifierResolveScope * getNearestQueryScope()
{
IdentifierResolveScope * scope_to_check = this;
while (scope_to_check != nullptr)
{
if (scope_to_check->scope_node->getNodeType() == QueryTreeNodeType::QUERY)
break;
scope_to_check = scope_to_check->parent_scope;
}
return scope_to_check;
}
TableExpressionData & getTableExpressionDataOrThrow(const QueryTreeNodePtr & table_expression_node)
{
auto it = table_expression_node_to_data.find(table_expression_node);
if (it == table_expression_node_to_data.end())
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Table expression {} data must be initialized. In scope {}",
table_expression_node->formatASTForErrorMessage(),
scope_node->formatASTForErrorMessage());
}
return it->second;
}
const TableExpressionData & getTableExpressionDataOrThrow(const QueryTreeNodePtr & table_expression_node) const
{
auto it = table_expression_node_to_data.find(table_expression_node);
if (it == table_expression_node_to_data.end())
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Table expression {} data must be initialized. In scope {}",
table_expression_node->formatASTForErrorMessage(),
scope_node->formatASTForErrorMessage());
}
return it->second;
}
/// Dump identifier resolve scope
[[maybe_unused]] void dump(WriteBuffer & buffer) const
{
buffer << "Scope node " << scope_node->formatASTForErrorMessage() << '\n';
buffer << "Identifier lookup to resolve state " << identifier_lookup_to_resolve_state.size() << '\n';
for (const auto & [identifier, state] : identifier_lookup_to_resolve_state)
{
buffer << "Identifier " << identifier.dump() << " resolve result ";
state.resolve_result.dump(buffer);
buffer << '\n';
}
buffer << "Expression argument name to node " << expression_argument_name_to_node.size() << '\n';
for (const auto & [alias_name, node] : expression_argument_name_to_node)
buffer << "Alias name " << alias_name << " node " << node->formatASTForErrorMessage() << '\n';
buffer << "Alias name to expression node table size " << alias_name_to_expression_node.size() << '\n';
for (const auto & [alias_name, node] : alias_name_to_expression_node)
buffer << "Alias name " << alias_name << " expression node " << node->dumpTree() << '\n';
buffer << "Alias name to function node table size " << alias_name_to_lambda_node.size() << '\n';
for (const auto & [alias_name, node] : alias_name_to_lambda_node)
buffer << "Alias name " << alias_name << " lambda node " << node->formatASTForErrorMessage() << '\n';
buffer << "Alias name to table expression node table size " << alias_name_to_table_expression_node.size() << '\n';
for (const auto & [alias_name, node] : alias_name_to_table_expression_node)
buffer << "Alias name " << alias_name << " node " << node->formatASTForErrorMessage() << '\n';
buffer << "CTE name to query node table size " << cte_name_to_query_node.size() << '\n';
for (const auto & [cte_name, node] : cte_name_to_query_node)
buffer << "CTE name " << cte_name << " node " << node->formatASTForErrorMessage() << '\n';
buffer << "WINDOW name to window node table size " << window_name_to_window_node.size() << '\n';
for (const auto & [window_name, node] : window_name_to_window_node)
buffer << "CTE name " << window_name << " node " << node->formatASTForErrorMessage() << '\n';
buffer << "Nodes with duplicated aliases size " << nodes_with_duplicated_aliases.size() << '\n';
for (const auto & node : nodes_with_duplicated_aliases)
buffer << "Alias name " << node->getAlias() << " node " << node->formatASTForErrorMessage() << '\n';
buffer << "Expression resolve process stack " << '\n';
expressions_in_resolve_process_stack.dump(buffer);
buffer << "Table expressions in resolve process size " << table_expressions_in_resolve_process.size() << '\n';
for (const auto & node : table_expressions_in_resolve_process)
buffer << "Table expression " << node->formatASTForErrorMessage() << '\n';
buffer << "Non cached identifier lookups during expression resolve " << non_cached_identifier_lookups_during_expression_resolve.size() << '\n';
for (const auto & identifier_lookup : non_cached_identifier_lookups_during_expression_resolve)
buffer << "Identifier lookup " << identifier_lookup.dump() << '\n';
buffer << "Table expression node to data " << table_expression_node_to_data.size() << '\n';
for (const auto & [table_expression_node, table_expression_data] : table_expression_node_to_data)
buffer << "Table expression node " << table_expression_node->formatASTForErrorMessage() << " data " << table_expression_data.dump() << '\n';
buffer << "Use identifier lookup to result cache " << use_identifier_lookup_to_result_cache << '\n';
buffer << "Subquery depth " << subquery_depth << '\n';
}
[[maybe_unused]] String dump() const
{
WriteBufferFromOwnString buffer;
dump(buffer);
return buffer.str();
}
};
/** Visitor that extracts expression and function aliases from node and initialize scope tables with it.
* Does not go into child lambdas and queries.
*
* Important:
* Identifier nodes with aliases are added both in alias to expression and alias to function map.
*
* These is necessary because identifier with alias can give alias name to any query tree node.
*
* Example:
* WITH (x -> x + 1) AS id, id AS value SELECT value(1);
* In this example id as value is identifier node that has alias, during scope initialization we cannot derive
* that id is actually lambda or expression.
*
* There are no easy solution here, without trying to make full featured expression resolution at this stage.
* Example:
* WITH (x -> x + 1) AS id, id AS id_1, id_1 AS id_2 SELECT id_2(1);
* Example: SELECT a, b AS a, b AS c, 1 AS c;
*
* It is client responsibility after resolving identifier node with alias, make following actions:
* 1. If identifier node was resolved in function scope, remove alias from scope expression map.
* 2. If identifier node was resolved in expression scope, remove alias from scope function map.
*
* That way we separate alias map initialization and expressions resolution.
*/
class QueryExpressionsAliasVisitor : public InDepthQueryTreeVisitor<QueryExpressionsAliasVisitor>
{
public:
explicit QueryExpressionsAliasVisitor(IdentifierResolveScope & scope_)
: scope(scope_)
{}
void visitImpl(QueryTreeNodePtr & node)
{
updateAliasesIfNeeded(node, false /*is_lambda_node*/);
}
bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr & child)
{
if (auto * lambda_node = child->as<LambdaNode>())
{
updateAliasesIfNeeded(child, true /*is_lambda_node*/);
return false;
}
else if (auto * query_tree_node = child->as<QueryNode>())
{
if (query_tree_node->isCTE())
return false;
updateAliasesIfNeeded(child, false /*is_lambda_node*/);
return false;
}
else if (auto * union_node = child->as<UnionNode>())
{
if (union_node->isCTE())
return false;
updateAliasesIfNeeded(child, false /*is_lambda_node*/);
return false;
}
return true;
}
private:
void updateAliasesIfNeeded(const QueryTreeNodePtr & node, bool is_lambda_node)
{
if (!node->hasAlias())
return;
// We should not resolve expressions to WindowNode
if (node->getNodeType() == QueryTreeNodeType::WINDOW)
return;
const auto & alias = node->getAlias();
if (is_lambda_node)
{
if (scope.alias_name_to_expression_node.contains(alias))
scope.nodes_with_duplicated_aliases.insert(node);
auto [_, inserted] = scope.alias_name_to_lambda_node.insert(std::make_pair(alias, node));
if (!inserted)
scope.nodes_with_duplicated_aliases.insert(node);
return;
}
if (scope.alias_name_to_lambda_node.contains(alias))
scope.nodes_with_duplicated_aliases.insert(node);
auto [_, inserted] = scope.alias_name_to_expression_node.insert(std::make_pair(alias, node));
if (!inserted)
scope.nodes_with_duplicated_aliases.insert(node);