Skip to content

ES|QL - Allow full text functions to be used in STATS ... WHERE #125479

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
5f4eff5
Add full text functions usage in STATS.. WHERE
carlosdelest Mar 24, 2025
748e576
Fix test
carlosdelest Mar 24, 2025
dc1dbe6
Skip rest compat test
carlosdelest Mar 24, 2025
3117d50
Fix test error message
carlosdelest Mar 24, 2025
3a753d2
Remove match operator check
carlosdelest Mar 26, 2025
2a8e987
Included ShardContext list into LocalExecutionPlannerContext
carlosdelest Mar 26, 2025
c99d373
Update docs/changelog/125479.yaml
carlosdelest Mar 26, 2025
2cc6898
Merge remote-tracking branch 'origin/main' into enhancement/esql-text…
carlosdelest Mar 26, 2025
425f146
Merge remote-tracking branch 'carlosdelest/enhancement/esql-text-sear…
carlosdelest Mar 26, 2025
c842f14
Add shard context information to aggregate filters
carlosdelest Mar 26, 2025
8a0f48f
Merge branch 'main' into enhancement/esql-text-search-functions-stats
carlosdelest Apr 28, 2025
dfce035
Allow FTFs in WHERE clauses but not in grouping clauses
carlosdelest May 5, 2025
3963fd3
Add CSV tests for STATS
carlosdelest May 5, 2025
85a2374
Add stats scores test
carlosdelest May 5, 2025
4f3228a
Merge remote-tracking branch 'origin/main' into enhancement/esql-text…
carlosdelest May 5, 2025
351d1ff
Fix capabilities
carlosdelest May 5, 2025
52dd7b9
Fix test
carlosdelest May 5, 2025
02f8748
Changed error message
carlosdelest May 5, 2025
f9bf170
[CI] Auto commit changes from spotless
elasticsearchmachine May 5, 2025
e98ad43
Forbid usage of _score aggregations on STATS when there is a WHERE cl…
carlosdelest May 9, 2025
40eaeb8
Merge remote-tracking branch 'origin/main' into enhancement/esql-text…
carlosdelest May 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/125479.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 125479
summary: ES|QL - Allow full text functions to be used in STATS
area: ES|QL
type: enhancement
issues:
- 125481
1 change: 1 addition & 0 deletions x-pack/plugin/build.gradle
Original file line number Diff line number Diff line change
@@ -95,6 +95,7 @@ tasks.named("yamlRestCompatTestTransform").configure({ task ->
task.skipTest("esql/61_enrich_ip/Invalid IP strings", "We switched from exceptions to null+warnings for ENRICH runtime errors")
task.skipTest("esql/180_match_operator/match with non text field", "Match operator can now be used on non-text fields")
task.skipTest("esql/180_match_operator/match with functions", "Error message changed")
task.skipTest("esql/180_match_operator/match within eval", "Error message changed")
task.skipTest("esql/40_unsupported_types/semantic_text declared in mapping", "The semantic text field format changed")
task.skipTest("esql/190_lookup_join/Alias as lookup index", "LOOKUP JOIN does not support index aliases for now")
task.skipTest("esql/190_lookup_join/alias-repeated-alias", "LOOKUP JOIN does not support index aliases for now")
Original file line number Diff line number Diff line change
@@ -189,3 +189,82 @@ book_no:keyword
7140
2714
;

testKqlInStatsNonPushable
required_capability: kql_function
required_capability: full_text_functions_in_stats_where

from books
| where length(title) > 40
| stats c = count(*) where kql("title:Lord")
;

c:long
3
;


testKqlInStatsPushableAndNonPushable
required_capability: kql_function
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where (kql("title: lord") and ratings > 4.5) or (kql("author: dostoevsky") and length(title) > 50)
;

c:long
6
;

testKqlInStatsPushable
required_capability: kql_function
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where kql("author:tolkien")
;

c:long
22
;

testKqlInStatsWithNonPushableDisjunctions
required_capability: kql_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS c = count(*) where kql("title: lord") or length(title) > 130
;

c:long
5
;

testKqlInStatsWithMultipleAggs
required_capability: kql_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS c = count(*) where kql("title: lord"), m = max(book_no::integer) where kql("author: tolkien"), n = min(book_no::integer) where kql("author: dostoevsky")
;

c:long | m:integer | n:integer
4 | 9607 | 1211
;


testKqlInStatsWithGrouping
required_capability: kql_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS r = AVG(ratings) where kql("title: Lord AND Rings") by author | WHERE r is not null
;
ignoreOrder: true

r:double | author: text
4.75 | Alan Lee
4.674999952316284 | J. R. R. Tolkien
4.670000076293945 | John Ronald Reuel Tolkien
4.670000076293945 | Agnes Perkins
4.670000076293945 | Charles Adolph Huttar
4.670000076293945 | Walter Scheps
4.559999942779541 | J.R.R. Tolkien
;
Original file line number Diff line number Diff line change
@@ -750,3 +750,94 @@ book_no:keyword
7140
2714
;

testMatchInStatsNonPushable
required_capability: match_function
required_capability: full_text_functions_in_stats_where

from books
| where length(title) > 40
| stats c = count(*) where match(title, "Lord")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here are some suggestions on test coverage that I can think of; these also apply to the other full text functions and operators:

  1. Can we have some tests with slightly more complicated predicates in the where clause under stats? For example, some combinations of and, or and not, with functions that can or cannot be pushed down to Lucene? Perhaps borrow some queries from the existing tests where the where clause is not under stats.
  2. I wonder how scoring works with where under stats; can we have some tests to capture how it behaves?
  3. Can we have some tests to cover multiple aggregate functions with a where clause under stats? The predicates for each aggregation can have some overlap, and we have some optimization rules to deal with overlapping predicates under stats.
  4. Can we have some tests with aggregation and grouping (BY) with full text functions under the where clause?
  5. Add some full text functions with options for completeness.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have some tests with slightly more complicated predicates in the where clause under stats? For example, some combinations of and, or and not, with functions that can or cannot be pushed down to Lucene? Perhaps borrow some queries from the existing tests where the where clause is not under stats.
Can we have some tests to cover multiple aggregate functions with a where clause under stats? The predicates for each aggregation can have some overlap, and we have some optimization rules to deal with overlapping predicates under stats.
Can we have some tests with aggregation and grouping (BY) with full text functions under the where clause?
Add some full text functions with options for completeness.

I added testing in 3963fd3, hopefully that works!

I wonder how score works with where under stats, can we have some tests to capture how it behaves?

It affects scoring as well. I'm thinking that

FROM my_index METADATA _score
| WHERE match(field, "query")
| STATS c = AVG(_score)

is the same as

FROM my_index METADATA _score
| STATS c = AVG(_score) WHERE match(field, "query")

so using a FTF in a STATS WHERE clause affects scoring as well. I think it's better for consistency, but happy to discuss with the team.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It affects scoring as well. I'm thinking that

FROM my_index METADATA _score
| WHERE match(field, "query")
| STATS c = AVG(_score)

is the same as

FROM my_index METADATA _score
| STATS c = AVG(_score) WHERE match(field, "query")

so using a FTF in a STATS WHERE clause affects scoring as well. I think it's better for consistency, but happy to discuss with the team.

The example above makes sense to me. I wonder how the _score works, when there are multiple aggregate functions in a stats command? For example, does the query below make sense? avg and max's scores seem to come from different sources.

FROM my_index METADATA _score
| STATS avg = AVG(_score) WHERE match(field, "query1"), max = max(_score) WHERE match(field, "query2")

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder how the _score works, when there are multiple aggregate functions in a stats command?

@fang-xing-esql good catch. That doesn't work, as the filters are not pushed down to Lucene and thus there's no scoring available 😓

It could be doable to calculate the scoring on the individual WHERE clauses in the aggregation, but I'm inclined not to do it. We're looking into having separate scores for individual queries as a separate effort, and this kind of replicates some of those efforts.

What I've done is disallow the use of _score aggregations in STATS when the STATS includes WHERE clauses. This still allows things like:

from books metadata _score 
| where match(title, "Lord Rings", {"operator": "AND"})
| stats avg_score = avg(_score), max_score = max(_score), min_score = min(_score)

but does not allow using _score aggregations together with WHERE in STATS:

from books metadata _score 
| stats avg_score = avg(_score) where match(title, "Lord Rings", {"operator": "AND"})

Change done in e98ad43

I think this is a good compromise, and we can work to lift that limitation later if need be.

LMKWYT

;

c:long
3
;

testMatchInStatsPushableAndNonPushable
required_capability: match_function
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where (match(title, "lord") and ratings > 4.5) or (match(author, "dostoevsky") and length(title) > 50)
;

c:long
6
;

testMatchInStatsPushable
required_capability: match_function
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where match(author, "tolkien")
;

c:long
22
;

testMatchInStatsWithOptions
required_capability: match_function
required_capability: full_text_functions_in_stats_where

FROM books
| STATS c = count(*) where match(title, "Hobbit Back Again", {"operator": "AND"})
;

c:long
1
;

testMatchInStatsWithNonPushableDisjunctions
required_capability: match_function
required_capability: full_text_functions_in_stats_where

FROM books
| STATS c = count(*) where match(title, "lord") or length(title) > 130
;

c:long
5
;

testMatchInStatsWithMultipleAggs
required_capability: match_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS c = count(*) where match(title, "lord"), m = max(book_no::integer) where match(author, "tolkien"), n = min(book_no::integer) where match(author, "dostoevsky")
;

c:long | m:integer | n:integer
4 | 9607 | 1211
;


testMatchInStatsWithGrouping
required_capability: match_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS r = AVG(ratings) where match(title, "Lord Rings", {"operator": "AND"}) by author | WHERE r is not null
;
ignoreOrder: true

r:double | author: text
4.75 | Alan Lee
4.674999952316284 | J. R. R. Tolkien
4.670000076293945 | John Ronald Reuel Tolkien
4.670000076293945 | Agnes Perkins
4.670000076293945 | Charles Adolph Huttar
4.670000076293945 | Walter Scheps
4.559999942779541 | J.R.R. Tolkien
;
Original file line number Diff line number Diff line change
@@ -751,3 +751,29 @@ from semantic_text
host:keyword | semantic_text_field:text | language_name:keyword | language_code:integer
"host1" | live long and prosper | English | 1
;


testMatchInStatsNonPushable
required_capability: match_operator_colon
required_capability: full_text_functions_in_stats_where

from books
| where length(title) > 40
| stats c = count(*) where title:"Lord"
;

c:long
3
;

testMatchInStatsPushable
required_capability: match_operator_colon
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where author:"tolkien"
;

c:long
22
;
Original file line number Diff line number Diff line change
@@ -210,3 +210,92 @@ book_no:keyword | title:text
7480 | The Hobbit
// end::qstr-with-options-result[]
;

testQstrInStatsNonPushable
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where

from books
| where length(title) > 40
| stats c = count(*) where qstr("title:Lord")
;

c:long
3
;

testQstrInStatsPushableAndNonPushable
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where (qstr("title: lord") and ratings > 4.5) or (qstr("author: dostoevsky") and length(title) > 50)
;

c:long
6
;

testQstrInStatsPushable
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where

from books
| stats c = count(*) where qstr("author:tolkien")
;

c:long
22
;

testQstrInStatsWithOptions
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where

FROM books
| STATS c = count(*) where qstr("title: Hobbit Back Again", {"default_operator": "AND"})
;

c:long
1
;

testQstrInStatsWithNonPushableDisjunctions
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS c = count(*) where qstr("title: lord") or length(title) > 130
;

c:long
5
;

testQstrInStatsWithMultipleAggs
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS c = count(*) where qstr("title: lord"), m = max(book_no::integer) where qstr("author: tolkien"), n = min(book_no::integer) where qstr("author: dostoevsky")
;

c:long | m:integer | n:integer
4 | 9607 | 1211
;

testQstrInStatsWithGrouping
required_capability: qstr_function
required_capability: full_text_functions_in_stats_where
FROM books
| STATS r = AVG(ratings) where qstr("title: Lord Rings", {"default_operator": "AND"}) by author | WHERE r is not null
;
ignoreOrder: true

r:double | author: text
4.75 | Alan Lee
4.674999952316284 | J. R. R. Tolkien
4.670000076293945 | John Ronald Reuel Tolkien
4.670000076293945 | Agnes Perkins
4.670000076293945 | Charles Adolph Huttar
4.670000076293945 | Walter Scheps
4.559999942779541 | J.R.R. Tolkien
;
Original file line number Diff line number Diff line change
@@ -521,7 +521,6 @@ book_no:keyword | _score:double
8678 | 0.0
;


disjunctionScoresMultipleClauses

required_capability: metadata_score
@@ -544,3 +543,18 @@ book_no:keyword | _score:double
4023 | 1.5062403678894043
2924 | 1.2732219696044922
;

statsScores

required_capability: metadata_score
required_capability: match_function
required_capability: full_text_functions_in_stats_where

from books metadata _score
| where match(title, "Lord Rings", {"operator": "AND"})
| stats avg_score = avg(_score), max_score = max(_score), min_score = min(_score)
;

avg_score:double | max_score:double | min_score:double
3.869828939437866 | 5.123856544494629 | 3.0124807357788086
;
Original file line number Diff line number Diff line change
@@ -66,7 +66,7 @@ public void testKqlQueryWithinEval() {
""";

var error = expectThrows(VerificationException.class, () -> run(query));
assertThat(error.getMessage(), containsString("[KQL] function is only supported in WHERE commands"));
assertThat(error.getMessage(), containsString("[KQL] function is only supported in WHERE and STATS commands"));
}

public void testInvalidKqlQueryEof() {
Loading
Oops, something went wrong.
Loading
Oops, something went wrong.