Commit

Merge remote-tracking branch 'origin/main' into ES-8348-write-multiple-gaps

ywangd committed Jul 16, 2024
2 parents 07e488a + e64aab1 commit 52a6700

Showing 256 changed files with 6,107 additions and 3,354 deletions.
@@ -193,6 +193,11 @@ public Set<String> getMissingClassExcludes() {
@SkipWhenEmpty
public abstract ConfigurableFileCollection getJarsToScan();

@Classpath
public FileCollection getClasspath() {
return classpath;
}

@TaskAction
public void runThirdPartyAudit() throws IOException {
Set<File> jars = getJarsToScan().getFiles();
5 changes: 0 additions & 5 deletions docs/changelog/110352.yaml

This file was deleted.

6 changes: 6 additions & 0 deletions docs/changelog/110593.yaml
@@ -0,0 +1,6 @@
pr: 110593
summary: "[ES|QL] add tests for stats by constant"
area: ES|QL
type: bug
issues:
- 105383
5 changes: 5 additions & 0 deletions docs/changelog/110677.yaml
@@ -0,0 +1,5 @@
pr: 110677
summary: Add validation for synthetic source mode in logs mode indices
area: Logs
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/110710.yaml
@@ -0,0 +1,6 @@
pr: 110710
summary: Add a cluster listener to fix missing node features after upgrading from a version prior to 8.13
area: Infra/Core
type: bug
issues:
- 109254
5 changes: 5 additions & 0 deletions docs/changelog/110718.yaml
@@ -0,0 +1,5 @@
pr: 110718
summary: "ESQL: Add boolean support to TOP aggregation"
area: ES|QL
type: feature
issues: []
7 changes: 7 additions & 0 deletions docs/changelog/110793.yaml
@@ -0,0 +1,7 @@
pr: 110793
summary: Fix for union-types for multiple columns with the same name
area: ES|QL
type: bug
issues:
- 110490
- 109916
5 changes: 5 additions & 0 deletions docs/changelog/110860.yaml
@@ -0,0 +1,5 @@
pr: 110860
summary: Speedup `CanMatchPreFilterSearchPhase` constructor
area: Search
type: bug
issues: []
@@ -0,0 +1,68 @@
[[claim-connector-sync-job-api]]
=== Claim connector sync job API
++++
<titleabbrev>Claim connector sync job</titleabbrev>
++++

preview::[]

Claims a connector sync job.

The `_claim` endpoint is not intended for direct connector management by users. It is there to support the implementation of services that utilize the https://github.com/elastic/connectors/blob/main/docs/CONNECTOR_PROTOCOL.md[Connector Protocol] to communicate with {es}.

To get started with Connector APIs, check out the {enterprise-search-ref}/connectors-tutorial-api.html[tutorial^].

[[claim-connector-sync-job-api-request]]
==== {api-request-title}
`PUT _connector/_sync_job/<connector_sync_job_id>/_claim`

[[claim-connector-sync-job-api-prereqs]]
==== {api-prereq-title}

* To sync data using self-managed connectors, you need to deploy the {enterprise-search-ref}/build-connector.html[Elastic connector service] on your own infrastructure. This service runs automatically on Elastic Cloud for native connectors.
* The `connector_sync_job_id` parameter should reference an existing connector sync job.

[[claim-connector-sync-job-api-desc]]
==== {api-description-title}

Claims a connector sync job. This action updates the job's status to `in_progress` and sets the `last_seen` and `started_at` timestamps to the current time. Additionally, it can set the `sync_cursor` property for the sync job.

[[claim-connector-sync-job-api-path-params]]
==== {api-path-parms-title}

`connector_sync_job_id`::
(Required, string) The ID of the connector sync job.

[role="child_attributes"]
[[claim-connector-sync-job-api-request-body]]
==== {api-request-body-title}

`worker_hostname`::
(Required, string) The host name of the current system that will execute the job.

`sync_cursor`::
(Optional, Object) The cursor object from the last incremental sync job. This should reference the `sync_cursor` field in the connector state for which the job is executed.
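
For illustration, a claim request that also sets a sync cursor might look like the following sketch. The job ID and the contents of the cursor object are hypothetical; the cursor's shape is defined by the connector implementation, not by this API:

[source,console]
----
PUT _connector/_sync_job/my-connector-sync-job-id/_claim
{
  "worker_hostname": "some-machine",
  "sync_cursor": {
    "last_synced_at": "2024-07-01T00:00:00Z"
  }
}
----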


[[claim-connector-sync-job-api-response-codes]]
==== {api-response-codes-title}

`200`::
Connector sync job was successfully claimed.

`404`::
No connector sync job matching `connector_sync_job_id` could be found.

[[claim-connector-sync-job-api-example]]
==== {api-examples-title}

The following example claims the connector sync job with ID `my-connector-sync-job-id`:

[source,console]
----
PUT _connector/_sync_job/my-connector-sync-job-id/_claim
{
  "worker_hostname": "some-machine"
}
----
// TEST[skip:there's no way to clean up after creating a connector sync job, as we don't know the id ahead of time. Therefore, skip this test.]
3 changes: 3 additions & 0 deletions docs/reference/connector/apis/connector-apis.asciidoc
@@ -108,6 +108,8 @@ preview:[]

* <<check-in-connector-sync-job-api>>
preview:[]
* <<claim-connector-sync-job-api>>
preview:[]
* <<set-connector-sync-job-error-api>>
preview:[]
* <<set-connector-sync-job-stats-api>>
@@ -141,5 +143,6 @@ include::update-connector-last-sync-api.asciidoc[]
include::update-connector-status-api.asciidoc[]

include::check-in-connector-sync-job-api.asciidoc[]
include::claim-connector-sync-job-api.asciidoc[]
include::set-connector-sync-job-error-api.asciidoc[]
include::set-connector-sync-job-stats-api.asciidoc[]
13 changes: 9 additions & 4 deletions docs/reference/esql/esql-across-clusters.asciidoc
@@ -8,6 +8,11 @@

preview::["{ccs-cap} for {esql} is in technical preview and may be changed or removed in a future release. Elastic will work to fix any issues, but features in technical preview are not subject to the support SLA of official GA features."]

[NOTE]
====
For {ccs-cap} with {esql} on version 8.16 or later, remote clusters must also be on version 8.16 or later.
====

With {esql}, you can execute a single query across multiple clusters.

[discrete]
@@ -64,7 +69,7 @@ You will need to:
* Create an API key on the *remote cluster* using the <<security-api-create-cross-cluster-api-key,Create cross-cluster API key>> API or using the {kibana-ref}/api-keys.html[Kibana API keys UI].
* Add the API key to the keystore on the *local cluster*, as part of the steps in <<remote-clusters-security-api-key-local-actions,configuring the local cluster>>. All cross-cluster requests from the local cluster are bound by the API key’s privileges.

Using {esql} with the API key based security model requires some additional permissions that may not be needed when using the traditional query DSL based search.
The following example API call creates a role that can query remote indices using {esql} when using the API key based security model.

[source,console]
@@ -73,11 +78,11 @@ POST /_security/role/remote1
{
"cluster": ["cross_cluster_search"], <1>
"indices": [
{
"names" : [""], <2>
"privileges": ["read"]
}
],
"remote_indices": [ <3>
{
"names": [ "logs-*" ],
@@ -93,7 +98,7 @@ POST /_security/role/remote1
<3> The indices allowed read access to the remote cluster. The configured <<security-api-create-cross-cluster-api-key,cross-cluster API key>> must also allow this index to be read.
<4> The `read_cross_cluster` privilege is always required when using {esql} across clusters with the API key based security model.
<5> The remote clusters to which these privileges apply.
This remote cluster must be configured with a <<security-api-create-cross-cluster-api-key,cross-cluster API key>> and connected to the remote cluster before the remote index can be queried.
Verify connection using the <<cluster-remote-info, Remote cluster info>> API.
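
As a quick sketch of that check, the connection status of all configured remote clusters can be inspected with a single call (cluster names and connection details will vary per deployment):

[source,console]
----
GET /_remote/info
----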

You will then need a user or API key with the permissions you created above. The following example API call creates a user with the `remote1` role.
5 changes: 5 additions & 0 deletions docs/reference/esql/esql-limitations.asciidoc
@@ -85,6 +85,11 @@ Some <<mapping-types,field types>> are not supported in all contexts:
** `cartesian_point`
** `cartesian_shape`

In addition, when <<esql-multi-index, querying multiple indices>>,
it's possible for the same field to be mapped to multiple types.
These fields cannot be directly used in queries or returned in results,
unless they're <<esql-multi-index-union-types, explicitly converted to a single type>>.
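
A minimal sketch of such a conversion, using hypothetical index and field names:

[source,esql]
----
FROM index-one,index-two
| EVAL code = TO_STRING(code)
| KEEP code
----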

[discrete]
[[esql-_source-availability]]
=== _source availability
175 changes: 175 additions & 0 deletions docs/reference/esql/esql-multi-index.asciidoc
@@ -0,0 +1,175 @@
[[esql-multi-index]]
=== Using {esql} to query multiple indices
++++
<titleabbrev>Using {esql} to query multiple indices</titleabbrev>
++++

With {esql}, you can execute a single query across multiple indices, data streams, or aliases.
To do so, use comma-separated lists, wildcards, and date math. The following example uses a comma-separated list and a wildcard:

[source,esql]
----
FROM employees-00001,other-employees-*
----

Use the format `<remote_cluster_name>:<target>` to <<esql-cross-clusters, query data streams and indices
on remote clusters>>:

[source,esql]
----
FROM cluster_one:employees-00001,cluster_two:other-employees-*
----

[discrete]
[[esql-multi-index-invalid-mapping]]
=== Field type mismatches

When querying multiple indices, data streams, or aliases, you might find that the same field is mapped to multiple different types.
For example, consider the two indices with the following field mappings:

*index: events_ip*
```
{
  "mappings": {
    "properties": {
      "@timestamp": { "type": "date" },
      "client_ip": { "type": "ip" },
      "event_duration": { "type": "long" },
      "message": { "type": "keyword" }
    }
  }
}
```

*index: events_keyword*
```
{
  "mappings": {
    "properties": {
      "@timestamp": { "type": "date" },
      "client_ip": { "type": "keyword" },
      "event_duration": { "type": "long" },
      "message": { "type": "keyword" }
    }
  }
}
```

When you query each of these individually with a simple query like `FROM events_ip`, the results are provided with type-specific columns:

[source.merge.styled,esql]
----
FROM events_ip
| SORT @timestamp DESC
----
[%header.monospaced.styled,format=dsv,separator=|]
|===
@timestamp:date | client_ip:ip | event_duration:long | message:keyword
2023-10-23T13:55:01.543Z | 172.21.3.15 | 1756467 | Connected to 10.1.0.1
2023-10-23T13:53:55.832Z | 172.21.3.15 | 5033755 | Connection error
2023-10-23T13:52:55.015Z | 172.21.3.15 | 8268153 | Connection error
|===

Note how the `client_ip` column is correctly identified as type `ip`, and all values are displayed.
However, if instead the query sources two conflicting indices with `FROM events_*`, the type of the `client_ip` column cannot be determined
and is reported as `unsupported` with all values returned as `null`.

[[query-unsupported]]
[source.merge.styled,esql]
----
FROM events_*
| SORT @timestamp DESC
----
[%header.monospaced.styled,format=dsv,separator=|]
|===
@timestamp:date | client_ip:unsupported | event_duration:long | message:keyword
2023-10-23T13:55:01.543Z | null | 1756467 | Connected to 10.1.0.1
2023-10-23T13:53:55.832Z | null | 5033755 | Connection error
2023-10-23T13:52:55.015Z | null | 8268153 | Connection error
2023-10-23T13:51:54.732Z | null | 725448 | Connection error
2023-10-23T13:33:34.937Z | null | 1232382 | Disconnected
2023-10-23T12:27:28.948Z | null | 2764889 | Connected to 10.1.0.2
2023-10-23T12:15:03.360Z | null | 3450233 | Connected to 10.1.0.3
|===

In addition, if the query refers to this unsupported field directly, the query fails:

[source.merge.styled,esql]
----
FROM events_*
| KEEP @timestamp, client_ip, event_duration, message
| SORT @timestamp DESC
----

[source,bash]
----
Cannot use field [client_ip] due to ambiguities being mapped as
[2] incompatible types:
[ip] in [events_ip],
[keyword] in [events_keyword]
----

[discrete]
[[esql-multi-index-union-types]]
=== Union types

{esql} has a way to handle <<esql-multi-index-invalid-mapping, field type mismatches>>. When the same field is mapped to multiple types in multiple indices,
the type of the field is understood to be a _union_ of the various types in the index mappings.
As seen in the preceding examples, this _union type_ cannot be used in the results,
and cannot be referred to by the query
-- except when it's passed to a type conversion function that accepts all the types in the _union_ and converts the field
to a single type. {esql} offers a suite of <<esql-type-conversion-functions,type conversion functions>> to achieve this.

In the above examples, the query can use a command like `EVAL client_ip = TO_IP(client_ip)` to resolve
the union of `ip` and `keyword` to just `ip`.
You can also use the type-conversion syntax `EVAL client_ip = client_ip::IP`.
Alternatively, the query could use <<esql-to_string,`TO_STRING`>> to convert all supported types into `KEYWORD`.
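
As a sketch, the cast syntax mentioned above applied to the same example:

[source,esql]
----
FROM events_*
| EVAL client_ip = client_ip::IP
| KEEP @timestamp, client_ip, event_duration, message
| SORT @timestamp DESC
----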

For example, the <<query-unsupported,query>> that returned `client_ip:unsupported` with `null` values can be improved using the `TO_IP` function or the equivalent `field::ip` syntax.
These changes also resolve the error message.
As long as the only reference to the original field is to pass it to a conversion function that resolves the type ambiguity, no error results.

[source.merge.styled,esql]
----
FROM events_*
| EVAL client_ip = TO_IP(client_ip)
| KEEP @timestamp, client_ip, event_duration, message
| SORT @timestamp DESC
----
[%header.monospaced.styled,format=dsv,separator=|]
|===
@timestamp:date | client_ip:ip | event_duration:long | message:keyword
2023-10-23T13:55:01.543Z | 172.21.3.15 | 1756467 | Connected to 10.1.0.1
2023-10-23T13:53:55.832Z | 172.21.3.15 | 5033755 | Connection error
2023-10-23T13:52:55.015Z | 172.21.3.15 | 8268153 | Connection error
2023-10-23T13:51:54.732Z | 172.21.3.15 | 725448 | Connection error
2023-10-23T13:33:34.937Z | 172.21.0.5 | 1232382 | Disconnected
2023-10-23T12:27:28.948Z | 172.21.2.113 | 2764889 | Connected to 10.1.0.2
2023-10-23T12:15:03.360Z | 172.21.2.162 | 3450233 | Connected to 10.1.0.3
|===
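
For comparison, a sketch of the `TO_STRING` alternative mentioned above, which resolves the union of `ip` and `keyword` to `keyword` instead of `ip`:

[source,esql]
----
FROM events_*
| EVAL client_ip = TO_STRING(client_ip)
| KEEP @timestamp, client_ip, event_duration, message
| SORT @timestamp DESC
----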

[discrete]
[[esql-multi-index-index-metadata]]
=== Index metadata

It can be helpful to know the particular index from which each row is sourced.
To get this information, use the <<esql-metadata-fields,`METADATA`>> option on the <<esql-from,`FROM`>> command.

[source.merge.styled,esql]
----
FROM events_* METADATA _index
| EVAL client_ip = TO_IP(client_ip)
| KEEP _index, @timestamp, client_ip, event_duration, message
| SORT @timestamp DESC
----
[%header.monospaced.styled,format=dsv,separator=|]
|===
_index:keyword | @timestamp:date | client_ip:ip | event_duration:long | message:keyword
events_ip | 2023-10-23T13:55:01.543Z | 172.21.3.15 | 1756467 | Connected to 10.1.0.1
events_ip | 2023-10-23T13:53:55.832Z | 172.21.3.15 | 5033755 | Connection error
events_ip | 2023-10-23T13:52:55.015Z | 172.21.3.15 | 8268153 | Connection error
events_keyword | 2023-10-23T13:51:54.732Z | 172.21.3.15 | 725448 | Connection error
events_keyword | 2023-10-23T13:33:34.937Z | 172.21.0.5 | 1232382 | Disconnected
events_keyword | 2023-10-23T12:27:28.948Z | 172.21.2.113 | 2764889 | Connected to 10.1.0.2
events_keyword | 2023-10-23T12:15:03.360Z | 172.21.2.162 | 3450233 | Connected to 10.1.0.3
|===
4 changes: 4 additions & 0 deletions docs/reference/esql/esql-using.asciidoc
@@ -12,6 +12,9 @@ and set up alerts.
Using {esql} in {elastic-sec} to investigate events in Timeline, create
detection rules, and build {esql} queries using Elastic AI Assistant.

<<esql-multi-index>>::
Using {esql} to query multiple indices and resolve field type mismatches.

<<esql-cross-clusters>>::
Using {esql} to query across multiple clusters.

@@ -21,5 +24,6 @@ Using the <<tasks,task management API>> to list and cancel {esql} queries.
include::esql-rest.asciidoc[]
include::esql-kibana.asciidoc[]
include::esql-security-solution.asciidoc[]
include::esql-multi-index.asciidoc[]
include::esql-across-clusters.asciidoc[]
include::task-management.asciidoc[]
