Distance measures for dense and sparse vectors (elastic#37947)
* Distance measures for dense and sparse vectors

Introduce painless functions `cosineSimilarity`
and `dotProduct` as distance measures
for dense and sparse vector fields.

```js
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.queryVector, doc['my_dense_vector'])",
        "params": {
          "queryVector": [4, 3.4, -1.2]
        }
      }
    }
  }
}
```

```js
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilaritySparse(params.queryVector, doc['my_sparse_vector'])",
        "params": {
          "queryVector": {"2": -0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}
        }
      }
    }
  }
}
```

Closes elastic#31615
mayya-sharipova authored and weizijun committed Feb 20, 2019
1 parent ac34b47 commit b017f3d
Showing 22 changed files with 1,339 additions and 74 deletions.
2 changes: 1 addition & 1 deletion docs/reference/mapping/types/dense-vector.asciidoc
@@ -9,7 +9,7 @@ not exceed 500. The number of dimensions can be
different across documents. A `dense_vector` field is
a single-valued field.

These vectors can be used for document scoring.
These vectors can be used for <<vector-functions,document scoring>>.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

2 changes: 1 addition & 1 deletion docs/reference/mapping/types/sparse-vector.asciidoc
@@ -9,7 +9,7 @@ not exceed 500. The number of dimensions can be
different across documents. A `sparse_vector` field is
a single-valued field.

These vectors can be used for document scoring.
These vectors can be used for <<vector-functions,document scoring>>.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

107 changes: 107 additions & 0 deletions docs/reference/query-dsl/script-score-query.asciidoc
@@ -74,6 +74,113 @@ to be the most efficient by using the internal mechanisms.
--------------------------------------------------
// NOTCONSOLE

[[vector-functions]]
===== Functions for vector fields
These functions are used for
<<dense-vector,`dense_vector`>> and
<<sparse-vector,`sparse_vector`>> fields.

For `dense_vector` fields, `cosineSimilarity` calculates the cosine
similarity between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "cosineSimilarity(params.queryVector, doc['my_dense_vector'])",
"params": {
"queryVector": [4, 3.4, -0.2] <1>
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE
<1> To take advantage of the script optimizations, provide a query vector as a script parameter.
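As a rough illustration of the measure being computed, here is a standalone sketch of cosine similarity over plain float arrays. This is not the Elasticsearch implementation, and the document vector shown is hypothetical:

```java
// Sketch of the cosine similarity measure: dot(q, d) / (|q| * |d|).
// Illustrative only; not the Elasticsearch implementation.
public final class CosineSketch {
    public static double cosineSimilarity(float[] q, float[] d) {
        double dot = 0, qNorm = 0, dNorm = 0;
        for (int i = 0; i < q.length; i++) {
            dot += q[i] * d[i];       // accumulate the dot product
            qNorm += q[i] * q[i];     // squared norm of the query vector
            dNorm += d[i] * d[i];     // squared norm of the document vector
        }
        return dot / (Math.sqrt(qNorm) * Math.sqrt(dNorm));
    }

    public static void main(String[] args) {
        float[] queryVector = {4f, 3.4f, -0.2f}; // params.queryVector from the example
        float[] docVector = {4f, 3.4f, -0.2f};   // hypothetical indexed vector
        // identical vectors have cosine similarity 1.0
        System.out.println(cosineSimilarity(queryVector, docVector));
    }
}
```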

Similarly, for `sparse_vector` fields, `cosineSimilaritySparse` calculates the cosine
similarity between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "cosineSimilaritySparse(params.queryVector, doc['my_sparse_vector'])",
"params": {
"queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE

For `dense_vector` fields, `dotProduct` calculates the dot
product between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "dotProduct(params.queryVector, doc['my_dense_vector'])",
"params": {
"queryVector": [4, 3.4, -0.2]
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE

Similarly, for `sparse_vector` fields, `dotProductSparse` calculates the dot product
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "dotProductSparse(params.queryVector, doc['my_sparse_vector'])",
"params": {
"queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE
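The semantics of the sparse dot product can be sketched with a map-based representation: only dimensions present in both vectors contribute to the sum. The map representation here is purely illustrative; internally the fields use an encoded doc-values format, not maps:

```java
import java.util.Map;

// Illustrative semantics of dotProductSparse: only dimensions shared by
// the query and document vectors contribute. Not the internal encoding.
public final class SparseDotSketch {
    public static double dotProductSparse(Map<Integer, Double> query, Map<Integer, Double> doc) {
        double sum = 0;
        for (Map.Entry<Integer, Double> e : query.entrySet()) {
            Double docValue = doc.get(e.getKey());
            if (docValue != null) {
                sum += e.getValue() * docValue; // shared dimension
            }
        }
        return sum;
    }

    public static void main(String[] args) {
        Map<Integer, Double> query = Map.of(2, 0.5, 10, 111.3, 50, -1.3);
        Map<Integer, Double> doc = Map.of(10, 2.0, 113, 14.8);
        // only dimension 10 is shared: 111.3 * 2.0
        System.out.println(dotProductSparse(query, doc));
    }
}
```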

NOTE: If a document doesn't have a value for a vector field on which
a vector function is executed, 0 is returned as a result
for this document.

NOTE: If a document's dense vector field has a number of dimensions
different from the query's vector, 0 is used for missing dimensions
in the calculations of vector functions.
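For instance, a 3-dimension query vector scored against a 2-dimension document vector behaves as if the document's missing third dimension were 0. A hand-worked sketch of this rule (not the shipped code):

```java
// Sketch of the zero-for-missing-dimensions rule from the NOTE above;
// illustrative only, not the Elasticsearch implementation.
public final class MissingDimsSketch {
    public static double dotProduct(float[] q, float[] d) {
        double sum = 0;
        int shared = Math.min(q.length, d.length); // dims past this contribute 0
        for (int i = 0; i < shared; i++) {
            sum += q[i] * d[i];
        }
        return sum;
    }

    public static void main(String[] args) {
        // 4*1 + 3.4*2 + (-0.2)*0 = 10.8
        System.out.println(dotProduct(new float[]{4f, 3.4f, -0.2f}, new float[]{1f, 2f}));
    }
}
```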


[[random-functions]]
===== Random functions
9 changes: 9 additions & 0 deletions modules/mapper-extras/build.gradle
@@ -20,4 +20,13 @@
esplugin {
description 'Adds advanced field mappers'
classname 'org.elasticsearch.index.mapper.MapperExtrasPlugin'
extendedPlugins = ['lang-painless']
}

dependencies {
compileOnly project(':modules:lang-painless')
}

integTestCluster {
module project(':modules:lang-painless')
}
@@ -30,6 +30,7 @@
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.VectorDVIndexFieldData;
import org.elasticsearch.search.DocValueFormat;

import java.io.IOException;
@@ -119,8 +120,7 @@ public Query existsQuery(QueryShardContext context) {

@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
return new VectorDVIndexFieldData.Builder(true);
}

@Override
@@ -30,6 +30,7 @@
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.VectorDVIndexFieldData;
import org.elasticsearch.search.DocValueFormat;

import java.io.IOException;
@@ -119,8 +120,7 @@ public Query existsQuery(QueryShardContext context) {

@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
return new VectorDVIndexFieldData.Builder(false);
}

@Override
@@ -23,7 +23,7 @@
import org.apache.lucene.util.InPlaceMergeSorter;

// static utility functions for encoding and decoding dense_vector and sparse_vector fields
final class VectorEncoderDecoder {
public final class VectorEncoderDecoder {
static final byte INT_BYTES = 4;
static final byte SHORT_BYTES = 2;

@@ -34,10 +34,11 @@ private VectorEncoderDecoder() { }
* BytesRef: int[] floats encoded as integers values, 2 bytes for each dimension
* @param values - values of the sparse array
* @param dims - dims of the sparse array
* @param dimCount - number of the dimension
* @param dimCount - number of the dimensions, necessary as values and dims are dynamically created arrays,
* and may be over-allocated
* @return BytesRef
*/
static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
public static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
// 1. Sort dims and values
sortSparseDimsValues(dims, values, dimCount);
byte[] buf = new byte[dimCount * (INT_BYTES + SHORT_BYTES)];
@@ -66,9 +67,12 @@ static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {

/**
* Decodes the first part of BytesRef into sparse vector dimensions
* @param vectorBR - vector decoded in BytesRef
* @param vectorBR - sparse vector encoded in BytesRef
*/
static int[] decodeSparseVectorDims(BytesRef vectorBR) {
public static int[] decodeSparseVectorDims(BytesRef vectorBR) {
if (vectorBR == null) {
throw new IllegalArgumentException("A document doesn't have a value for a vector field!");
}
int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
int[] dims = new int[dimCount];
int offset = vectorBR.offset;
@@ -81,9 +85,12 @@ static int[] decodeSparseVectorDims(BytesRef vectorBR) {

/**
* Decodes the second part of the BytesRef into sparse vector values
* @param vectorBR - vector decoded in BytesRef
* @param vectorBR - sparse vector encoded in BytesRef
*/
static float[] decodeSparseVector(BytesRef vectorBR) {
public static float[] decodeSparseVector(BytesRef vectorBR) {
if (vectorBR == null) {
throw new IllegalArgumentException("A document doesn't have a value for a vector field!");
}
int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
int offset = vectorBR.offset + SHORT_BYTES * dimCount; //calculate the offset from where values are encoded
float[] vector = new float[dimCount];
@@ -100,10 +107,14 @@ static float[] decodeSparseVector(BytesRef vectorBR) {


/**
Sort dimensions in the ascending order and
sort values in the same order as their corresponding dimensions
**/
static void sortSparseDimsValues(int[] dims, float[] values, int n) {
* Sorts dimensions in the ascending order and
* sorts values in the same order as their corresponding dimensions
*
* @param dims - dimensions of the sparse query vector
* @param values - values for the sparse query vector
* @param n - number of dimensions
*/
public static void sortSparseDimsValues(int[] dims, float[] values, int n) {
new InPlaceMergeSorter() {
@Override
public int compare(int i, int j) {
@@ -123,8 +134,42 @@ public void swap(int i, int j) {
}.sort(0, n);
}

// Decodes a BytesRef into an array of floats
static float[] decodeDenseVector(BytesRef vectorBR) {
/**
* Sorts dimensions in the ascending order and
* sorts values in the same order as their corresponding dimensions
*
* @param dims - dimensions of the sparse query vector
* @param values - values for the sparse query vector
* @param n - number of dimensions
*/
public static void sortSparseDimsDoubleValues(int[] dims, double[] values, int n) {
new InPlaceMergeSorter() {
@Override
public int compare(int i, int j) {
return Integer.compare(dims[i], dims[j]);
}

@Override
public void swap(int i, int j) {
int tempDim = dims[i];
dims[i] = dims[j];
dims[j] = tempDim;

double tempValue = values[j];
values[j] = values[i];
values[i] = tempValue;
}
}.sort(0, n);
}

/**
* Decodes a BytesRef into an array of floats
* @param vectorBR - dense vector encoded in BytesRef
*/
public static float[] decodeDenseVector(BytesRef vectorBR) {
if (vectorBR == null) {
throw new IllegalArgumentException("A document doesn't have a value for a vector field!");
}
int dimCount = vectorBR.length / INT_BYTES;
float[] vector = new float[dimCount];
int offset = vectorBR.offset;
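The byte layout these methods work with can be sketched as a standalone round-trip, inferred from the `INT_BYTES`/`SHORT_BYTES` constants and the decode offsets in the diff: the sorted dimensions come first as 2-byte shorts, followed by the values as 4-byte floats. The class and method names below are illustrative, not the shipped encoder:

```java
import java.nio.ByteBuffer;

// Illustrative round-trip for the sparse-vector byte layout (inferred from
// the diff): dimCount shorts holding the sorted dimensions, then dimCount
// 4-byte float values. Not the Elasticsearch implementation.
public final class SparseLayoutSketch {
    static final int INT_BYTES = 4;
    static final int SHORT_BYTES = 2;

    public static byte[] encode(int[] dims, float[] values, int dimCount) {
        ByteBuffer buf = ByteBuffer.allocate(dimCount * (INT_BYTES + SHORT_BYTES));
        for (int i = 0; i < dimCount; i++) {
            buf.putShort((short) dims[i]); // dimensions first, 2 bytes each
        }
        for (int i = 0; i < dimCount; i++) {
            buf.putFloat(values[i]);       // then values, 4 bytes each
        }
        return buf.array();
    }

    public static float[] decodeValues(byte[] bytes) {
        int dimCount = bytes.length / (INT_BYTES + SHORT_BYTES);
        ByteBuffer buf = ByteBuffer.wrap(bytes);
        buf.position(SHORT_BYTES * dimCount); // skip past the dimensions block
        float[] values = new float[dimCount];
        for (int i = 0; i < dimCount; i++) {
            values[i] = buf.getFloat();
        }
        return values;
    }
}
```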
@@ -0,0 +1,42 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.query;


import org.elasticsearch.painless.spi.PainlessExtension;
import org.elasticsearch.painless.spi.Whitelist;
import org.elasticsearch.painless.spi.WhitelistLoader;
import org.elasticsearch.script.ScoreScript;
import org.elasticsearch.script.ScriptContext;

import java.util.Collections;
import java.util.List;
import java.util.Map;

public class DocValuesWhitelistExtension implements PainlessExtension {

private static final Whitelist WHITELIST =
WhitelistLoader.loadFromResourceFiles(DocValuesWhitelistExtension.class, "docvalues_whitelist.txt");

@Override
public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
return Collections.singletonMap(ScoreScript.CONTEXT, Collections.singletonList(WHITELIST));
}
}
