Skip to content

Commit

Permalink
Merge branch 'main' into feature/binary_search_in_bkd
Browse files Browse the repository at this point in the history
* main:
  LUCENE-10421: use Constant instead of relying upon timestamp (apache#686)
  Remove TODO for LUCENE-9952 since that issue was fixed
  LUCENE-10382: Use `IndexReaderContext#id` to check reader identity. (apache#702)
  LUCENE-10408: Write doc IDs of KNN vectors as ints rather than vints. (apache#708)
  LUCENE-10439: Support multi-valued and multiple dimensions for count query in PointRangeQuery (apache#705)
  LUCENE-10417: Revert "LUCENE-10315" (apache#706)
  LUCENE-10382: Fix testSearchWithVisitedLimit failures
  LUCENE-10435: add CHANGES.txt entry (apache#704)
  LUCENE-10382: Ensure kNN filtering works with other codecs (apache#700)
  LUCENE-10054: Make sure to use Lucene90 codec in unit tests (apache#699)
  LUCENE-10435: Break loop early while checking whether DocValuesFieldExistsQuery can be rewrite to MatchAllDocsQuery (apache#701)
  LUCENE-10437:  Improve error message in the Tessellator for polygon with all points collinear (apache#703)
  LUCENE-10416: move changes entry to v10.0.0
  • Loading branch information
wjp719 committed Feb 27, 2022
2 parents 273ad41 + 466278e commit 5f640d3
Show file tree
Hide file tree
Showing 32 changed files with 327 additions and 511 deletions.
16 changes: 8 additions & 8 deletions lucene/CHANGES.txt
Expand Up @@ -24,7 +24,9 @@ New Features

Improvements
---------------------
(No changes)

* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)

Optimizations
---------------------
Expand Down Expand Up @@ -189,9 +191,6 @@ Improvements
* LUCENE-10371: Make IndexRearranger able to arrange segment in a determined order.
(Patrick Zhai)

* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)

Optimizations
---------------------

Expand Down Expand Up @@ -222,8 +221,6 @@ Optimizations

* LUCENE-10388: Remove MultiLevelSkipListReader#SkipBuffer to make JVM less confused. (Guo Feng)

* LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)

* LUCENE-10367: Optimize CoveringQuery for the case when the minimum number of
matching clauses is a constant. (LuYunCheng via Adrien Grand)

Expand All @@ -234,8 +231,8 @@ Optimizations

* LUCENE-10424 Optimize the "everything matches" case for count query in PointRangeQuery. (Ignacio Vera, Lu Xugang)

* LUCENE-10084: Rewrite DocValuesFieldExistsQuery to MatchAllDocsQuery whenever terms
or points have a docCount that is equal to maxDoc. (Vigya Sharma)
* LUCENE-10084, LUCENE-10435: Rewrite DocValuesFieldExistsQuery to MatchAllDocsQuery whenever
terms or points have a docCount that is equal to maxDoc. (Vigya Sharma, Lu Xugang)

Changes in runtime behavior
---------------------
Expand Down Expand Up @@ -302,6 +299,9 @@ Other

* LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward)

* LUCENE-10437: Polygon tessellator throws a more informative error message when the provided polygon
does not contain enough non-collinear points. (Ignacio Vera)

======================= Lucene 9.0.0 =======================

New Features
Expand Down
Expand Up @@ -164,7 +164,7 @@ public final PointsFormat pointsFormat() {
}

@Override
public final KnnVectorsFormat knnVectorsFormat() {
public KnnVectorsFormat knnVectorsFormat() {
return knnVectorsFormat;
}

Expand Down
Expand Up @@ -38,7 +38,7 @@
public final class Lucene90HnswGraphBuilder {

/** Default random seed for level generation * */
private static final long DEFAULT_RAND_SEED = System.currentTimeMillis();
private static final long DEFAULT_RAND_SEED = 42;
/** A name for the HNSW component for the info-stream * */
public static final String HNSW_COMPONENT = "HNSW";

Expand Down Expand Up @@ -144,7 +144,15 @@ void addGraphNode(float[] value) throws IOException {
// We pass 'null' for acceptOrds because there are no deletions while building the graph
NeighborQueue candidates =
Lucene90OnHeapHnswGraph.search(
value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);
value,
beamWidth,
beamWidth,
vectorValues,
similarityFunction,
hnsw,
null,
Integer.MAX_VALUE,
random);

int node = hnsw.addNode();

Expand Down
Expand Up @@ -252,6 +252,7 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
fieldEntry.similarityFunction,
getGraphValues(fieldEntry),
getAcceptOrds(acceptDocs, fieldEntry),
visitedLimit,
random);
int i = 0;
ScoreDoc[] scoreDocs = new ScoreDoc[Math.min(results.size(), k)];
Expand All @@ -261,11 +262,11 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
results.pop();
scoreDocs[scoreDocs.length - ++i] = new ScoreDoc(fieldEntry.ordToDoc[node], score);
}
// always return >= the case where we can assert == is only when there are fewer than topK
// vectors in the index
return new TopDocs(
new TotalHits(results.visitedCount(), TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO),
scoreDocs);
TotalHits.Relation relation =
results.incomplete()
? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO
: TotalHits.Relation.EQUAL_TO;
return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
}

private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
Expand Down
Expand Up @@ -80,6 +80,7 @@ public static NeighborQueue search(
VectorSimilarityFunction similarityFunction,
HnswGraph graphValues,
Bits acceptOrds,
int visitedLimit,
SplittableRandom random)
throws IOException {
int size = graphValues.size();
Expand All @@ -89,19 +90,25 @@ public static NeighborQueue search(
// MAX heap, from which to pull the candidate nodes
NeighborQueue candidates = new NeighborQueue(numSeed, !similarityFunction.reversed);

int numVisited = 0;
// set of ordinals that have been visited by search on this layer, used to avoid backtracking
SparseFixedBitSet visited = new SparseFixedBitSet(size);
// get initial candidates at random
int boundedNumSeed = Math.min(numSeed, 2 * size);
for (int i = 0; i < boundedNumSeed; i++) {
int entryPoint = random.nextInt(size);
if (visited.getAndSet(entryPoint) == false) {
if (numVisited >= visitedLimit) {
results.markIncomplete();
break;
}
// explore the topK starting points of some random numSeed probes
float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
candidates.add(entryPoint, score);
if (acceptOrds == null || acceptOrds.get(entryPoint)) {
results.add(entryPoint, score);
}
numVisited++;
}
}

Expand All @@ -110,7 +117,7 @@ public static NeighborQueue search(
// to exceed this bound
BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
bound.set(results.topScore());
while (candidates.size() > 0) {
while (candidates.size() > 0 && results.incomplete() == false) {
// get the best candidate (closest or best scoring)
float topCandidateScore = candidates.topScore();
if (results.size() >= topK) {
Expand All @@ -127,6 +134,11 @@ public static NeighborQueue search(
continue;
}

if (numVisited >= visitedLimit) {
results.markIncomplete();
break;
}

float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
if (results.size() < numSeed || bound.check(score) == false) {
candidates.add(friendOrd, score);
Expand All @@ -135,12 +147,13 @@ public static NeighborQueue search(
bound.set(results.topScore());
}
}
numVisited++;
}
}
while (results.size() > topK) {
results.pop();
}
results.setVisitedCount(visited.approximateCardinality());
results.setVisitedCount(numVisited);
return results;
}

Expand Down
Expand Up @@ -80,14 +80,6 @@ public void readLongs(long[] dst, int offset, int length) throws IOException {
}
}

@Override
public void readInts(int[] dst, int offset, int length) throws IOException {
  // Bulk-read the little-endian ints, then reverse the byte order of each one in place.
  in.readInts(dst, offset, length);
  for (int i = offset, end = offset + length; i < end; i++) {
    dst[i] = Integer.reverseBytes(dst[i]);
  }
}

@Override
public void readFloats(float[] dst, int offset, int length) throws IOException {
in.readFloats(dst, offset, length);
Expand Down
@@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene90;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;

/**
 * Read-write variant of the Lucene90 codec, allowing tests to keep writing the
 * Lucene90 HNSW vectors format after it has become a read-only backward codec.
 */
public class Lucene90RWCodec extends Lucene90Codec {

  // Format used for every vector field; wrapped by the per-field dispatcher below.
  private final KnnVectorsFormat defaultKnnVectorsFormat;
  // Per-field dispatcher that always resolves to the default read-write format.
  private final KnnVectorsFormat knnVectorsFormat;

  public Lucene90RWCodec() {
    this.defaultKnnVectorsFormat =
        new Lucene90RWHnswVectorsFormat(
            Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN,
            Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
    this.knnVectorsFormat =
        new PerFieldKnnVectorsFormat() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            return defaultKnnVectorsFormat;
          }
        };
  }

  @Override
  public KnnVectorsFormat knnVectorsFormat() {
    return knnVectorsFormat;
  }
}
Expand Up @@ -23,12 +23,11 @@
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return TestUtil.getDefaultCodec();
return new Lucene90RWCodec();
}

public void testToString() {
Expand Down
Expand Up @@ -154,20 +154,31 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
FieldInfo info = readState.fieldInfos.fieldInfo(field);
VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction();
HitQueue topK = new HitQueue(k, false);

int numVisited = 0;
TotalHits.Relation relation = TotalHits.Relation.EQUAL_TO;

int doc;
while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (acceptDocs != null && acceptDocs.get(doc) == false) {
continue;
}

if (numVisited >= visitedLimit) {
relation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
break;
}

float[] vector = values.vectorValue();
float score = vectorSimilarity.convertToScore(vectorSimilarity.compare(vector, target));
topK.insertWithOverflow(new ScoreDoc(doc, score));
numVisited++;
}
ScoreDoc[] topScoreDocs = new ScoreDoc[topK.size()];
for (int i = topScoreDocs.length - 1; i >= 0; i--) {
topScoreDocs[i] = topK.pop();
}
return new TopDocs(new TotalHits(values.size(), TotalHits.Relation.EQUAL_TO), topScoreDocs);
return new TopDocs(new TotalHits(numVisited, relation), topScoreDocs);
}

@Override
Expand Down
Expand Up @@ -69,13 +69,13 @@
* <li><b>[int]</b> the number of documents having values for this field
* <li><b>[int8]</b> if equals to -1, dense – all documents have values for a field. If equals to
* 0, sparse – some documents missing values.
* <li><b>array[vint]</b> for sparse case, the docids of documents having vectors, in order
* <li><b>array[int]</b> for sparse case, the docids of documents having vectors, in order
<li><b>[int]</b> the maximum number of connections (neighbours) that each node can have
* <li><b>[int]</b> number of levels in the graph
* <li>Graph nodes by level. For each level
* <ul>
* <li><b>[int]</b> the number of nodes on this level
* <li><b>array[vint]</b> for levels greater than 0 list of nodes on this level, stored as
* <li><b>array[int]</b> for levels greater than 0 list of nodes on this level, stored as
the level 0 node ordinals.
* </ul>
* </ul>
Expand Down
Expand Up @@ -347,7 +347,7 @@ private static class FieldEntry {
// as not all docs have vector values, fill a mapping from dense vector ordinals to docIds
ordToDoc = new int[size];
for (int i = 0; i < size; i++) {
int doc = input.readVInt();
int doc = input.readInt();
ordToDoc[i] = doc;
}
}
Expand All @@ -366,7 +366,7 @@ private static class FieldEntry {
} else {
nodesByLevel[level] = new int[numNodesOnLevel];
for (int i = 0; i < numNodesOnLevel; i++) {
nodesByLevel[level][i] = input.readVInt();
nodesByLevel[level][i] = input.readInt();
}
}
}
Expand Down
Expand Up @@ -213,7 +213,7 @@ private void writeMeta(
meta.writeByte((byte) 0); // sparse marker, some documents don't have vector values
DocIdSetIterator iter = docsWithField.iterator();
for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
meta.writeVInt(doc);
meta.writeInt(doc);
}
}

Expand All @@ -229,7 +229,7 @@ private void writeMeta(
if (level > 0) {
while (nodesOnLevel.hasNext()) {
int node = nodesOnLevel.nextInt();
meta.writeVInt(node); // list of nodes on a level
meta.writeInt(node); // list of nodes on a level
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions lucene/core/src/java/org/apache/lucene/geo/Tessellator.java
Expand Up @@ -102,6 +102,9 @@ public static List<Triangle> tessellate(final Polygon polygon, boolean checkSelf
if (outerNode == null) {
throw new IllegalArgumentException("Malformed shape detected in Tessellator!");
}
if (outerNode == outerNode.next || outerNode == outerNode.next.next) {
throw new IllegalArgumentException("at least three non-collinear points required");
}

// Determine if the specified list of points contains holes
if (polygon.numHoles() > 0) {
Expand Down Expand Up @@ -154,6 +157,9 @@ public static List<Triangle> tessellate(final XYPolygon polygon, boolean checkSe
if (outerNode == null) {
throw new IllegalArgumentException("Malformed shape detected in Tessellator!");
}
if (outerNode == outerNode.next || outerNode == outerNode.next.next) {
throw new IllegalArgumentException("at least three non-collinear points required");
}

// Determine if the specified list of points contains holes
if (polygon.numHoles() > 0) {
Expand Down
Expand Up @@ -69,17 +69,18 @@ public void visit(QueryVisitor visitor) {

@Override
public Query rewrite(IndexReader reader) throws IOException {
int rewritableReaders = 0;
boolean allReadersRewritable = true;
for (LeafReaderContext context : reader.leaves()) {
LeafReader leaf = context.reader();
Terms terms = leaf.terms(field);
PointValues pointValues = leaf.getPointValues(field);
if ((terms != null && terms.getDocCount() == leaf.maxDoc())
|| (pointValues != null && pointValues.getDocCount() == leaf.maxDoc())) {
rewritableReaders++;
if ((terms == null || terms.getDocCount() != leaf.maxDoc())
&& (pointValues == null || pointValues.getDocCount() != leaf.maxDoc())) {
allReadersRewritable = false;
break;
}
}
if (rewritableReaders == reader.leaves().size()) {
if (allReadersRewritable) {
return new MatchAllDocsQuery();
}
return super.rewrite(reader);
Expand Down

0 comments on commit 5f640d3

Please sign in to comment.