Skip to content

Commit

Permalink
Merge branch 'main' into feature/binary_search_in_bkd
Browse files Browse the repository at this point in the history
* main:
  LUCENE-10421: use Constant instead of relying upon timestamp (apache#686)
  Remove TODO for LUCENE-9952 since that issue was fixed
  LUCENE-10382: Use `IndexReaderContext#id` to check reader identity. (apache#702)
  LUCENE-10408: Write doc IDs of KNN vectors as ints rather than vints. (apache#708)
  LUCENE-10439: Support multi-valued and multiple dimensions for count query in PointRangeQuery (apache#705)
  LUCENE-10417: Revert "LUCENE-10315" (apache#706)
  LUCENE-10382: Fix testSearchWithVisitedLimit failures
  LUCENE-10435: add CHANGES.txt entry (apache#704)
  LUCENE-10382: Ensure kNN filtering works with other codecs (apache#700)
  LUCENE-10054: Make sure to use Lucene90 codec in unit tests (apache#699)
  LUCENE-10435: Break loop early while checking whether DocValuesFieldExistsQuery can be rewrite to MatchAllDocsQuery (apache#701)
  LUCENE-10437:  Improve error message in the Tessellator for polygon with all points collinear (apache#703)
  LUCENE-10416: move changes entry to v10.0.0
  • Loading branch information
wjp719 committed Feb 27, 2022
2 parents 273ad41 + 466278e commit 5f640d3
Show file tree
Hide file tree
Showing 32 changed files with 327 additions and 511 deletions.
16 changes: 8 additions & 8 deletions lucene/CHANGES.txt
Expand Up @@ -24,7 +24,9 @@ New Features

Improvements
---------------------
(No changes)

* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)

Optimizations
---------------------
Expand Down Expand Up @@ -189,9 +191,6 @@ Improvements
* LUCENE-10371: Make IndexRearranger able to arrange segment in a determined order.
(Patrick Zhai)

* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)

Optimizations
---------------------

Expand Down Expand Up @@ -222,8 +221,6 @@ Optimizations

* LUCENE-10388: Remove MultiLevelSkipListReader#SkipBuffer to make JVM less confused. (Guo Feng)

* LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)

* LUCENE-10367: Optimize CoveringQuery for the case when the minimum number of
matching clauses is a constant. (LuYunCheng via Adrien Grand)

Expand All @@ -234,8 +231,8 @@ Optimizations

* LUCENE-10424 Optimize the "everything matches" case for count query in PointRangeQuery. (Ignacio Vera, Lu Xugang)

* LUCENE-10084: Rewrite DocValuesFieldExistsQuery to MatchAllDocsQuery whenever terms
or points have a docCount that is equal to maxDoc. (Vigya Sharma)
* LUCENE-10084, LUCENE-10435: Rewrite DocValuesFieldExistsQuery to MatchAllDocsQuery whenever
terms or points have a docCount that is equal to maxDoc. (Vigya Sharma, Lu Xugang)

Changes in runtime behavior
---------------------
Expand Down Expand Up @@ -302,6 +299,9 @@ Other

* LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward)

* LUCENE-10437: Polygon tessellator throws a more informative error message when the provided polygon
does not contain enough non-collinear points. (Ignacio Vera)

======================= Lucene 9.0.0 =======================

New Features
Expand Down
Expand Up @@ -164,7 +164,7 @@ public final PointsFormat pointsFormat() {
}

@Override
public final KnnVectorsFormat knnVectorsFormat() {
public KnnVectorsFormat knnVectorsFormat() {
return knnVectorsFormat;
}

Expand Down
Expand Up @@ -38,7 +38,7 @@
public final class Lucene90HnswGraphBuilder {

/** Default random seed for level generation * */
private static final long DEFAULT_RAND_SEED = System.currentTimeMillis();
private static final long DEFAULT_RAND_SEED = 42;
/** A name for the HNSW component for the info-stream * */
public static final String HNSW_COMPONENT = "HNSW";

Expand Down Expand Up @@ -144,7 +144,15 @@ void addGraphNode(float[] value) throws IOException {
// We pass 'null' for acceptOrds because there are no deletions while building the graph
NeighborQueue candidates =
Lucene90OnHeapHnswGraph.search(
value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);
value,
beamWidth,
beamWidth,
vectorValues,
similarityFunction,
hnsw,
null,
Integer.MAX_VALUE,
random);

int node = hnsw.addNode();

Expand Down
Expand Up @@ -252,6 +252,7 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
fieldEntry.similarityFunction,
getGraphValues(fieldEntry),
getAcceptOrds(acceptDocs, fieldEntry),
visitedLimit,
random);
int i = 0;
ScoreDoc[] scoreDocs = new ScoreDoc[Math.min(results.size(), k)];
Expand All @@ -261,11 +262,11 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
results.pop();
scoreDocs[scoreDocs.length - ++i] = new ScoreDoc(fieldEntry.ordToDoc[node], score);
}
// always return >= the case where we can assert == is only when there are fewer than topK
// vectors in the index
return new TopDocs(
new TotalHits(results.visitedCount(), TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO),
scoreDocs);
TotalHits.Relation relation =
results.incomplete()
? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO
: TotalHits.Relation.EQUAL_TO;
return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs);
}

private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
Expand Down
Expand Up @@ -80,6 +80,7 @@ public static NeighborQueue search(
VectorSimilarityFunction similarityFunction,
HnswGraph graphValues,
Bits acceptOrds,
int visitedLimit,
SplittableRandom random)
throws IOException {
int size = graphValues.size();
Expand All @@ -89,19 +90,25 @@ public static NeighborQueue search(
// MAX heap, from which to pull the candidate nodes
NeighborQueue candidates = new NeighborQueue(numSeed, !similarityFunction.reversed);

int numVisited = 0;
// set of ordinals that have been visited by search on this layer, used to avoid backtracking
SparseFixedBitSet visited = new SparseFixedBitSet(size);
// get initial candidates at random
int boundedNumSeed = Math.min(numSeed, 2 * size);
for (int i = 0; i < boundedNumSeed; i++) {
int entryPoint = random.nextInt(size);
if (visited.getAndSet(entryPoint) == false) {
if (numVisited >= visitedLimit) {
results.markIncomplete();
break;
}
// explore the topK starting points of some random numSeed probes
float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
candidates.add(entryPoint, score);
if (acceptOrds == null || acceptOrds.get(entryPoint)) {
results.add(entryPoint, score);
}
numVisited++;
}
}

Expand All @@ -110,7 +117,7 @@ public static NeighborQueue search(
// to exceed this bound
BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
bound.set(results.topScore());
while (candidates.size() > 0) {
while (candidates.size() > 0 && results.incomplete() == false) {
// get the best candidate (closest or best scoring)
float topCandidateScore = candidates.topScore();
if (results.size() >= topK) {
Expand All @@ -127,6 +134,11 @@ public static NeighborQueue search(
continue;
}

if (numVisited >= visitedLimit) {
results.markIncomplete();
break;
}

float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
if (results.size() < numSeed || bound.check(score) == false) {
candidates.add(friendOrd, score);
Expand All @@ -135,12 +147,13 @@ public static NeighborQueue search(
bound.set(results.topScore());
}
}
numVisited++;
}
}
while (results.size() > topK) {
results.pop();
}
results.setVisitedCount(visited.approximateCardinality());
results.setVisitedCount(numVisited);
return results;
}

Expand Down
Expand Up @@ -80,14 +80,6 @@ public void readLongs(long[] dst, int offset, int length) throws IOException {
}
}

@Override
public void readInts(int[] dst, int offset, int length) throws IOException {
  // Bulk-read the little-endian ints, then reverse the byte order of each one in place.
  in.readInts(dst, offset, length);
  for (int i = offset, end = offset + length; i < end; i++) {
    dst[i] = Integer.reverseBytes(dst[i]);
  }
}

@Override
public void readFloats(float[] dst, int offset, int length) throws IOException {
in.readFloats(dst, offset, length);
Expand Down
@@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene90;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;

/**
 * Read-write variant of the Lucene90 codec, allowing tests to keep writing the
 * Lucene90 HNSW vectors format after it has become a read-only backward codec.
 */
public class Lucene90RWCodec extends Lucene90Codec {

  // Format used for every vector field; wrapped by the per-field dispatcher below.
  private final KnnVectorsFormat defaultKnnVectorsFormat;
  // Per-field dispatcher that always resolves to the default read-write format.
  private final KnnVectorsFormat knnVectorsFormat;

  public Lucene90RWCodec() {
    this.defaultKnnVectorsFormat =
        new Lucene90RWHnswVectorsFormat(
            Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN,
            Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
    this.knnVectorsFormat =
        new PerFieldKnnVectorsFormat() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            return defaultKnnVectorsFormat;
          }
        };
  }

  @Override
  public KnnVectorsFormat knnVectorsFormat() {
    return knnVectorsFormat;
  }
}
Expand Up @@ -23,12 +23,11 @@
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return TestUtil.getDefaultCodec();
return new Lucene90RWCodec();
}

public void testToString() {
Expand Down
Expand Up @@ -154,20 +154,31 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int
FieldInfo info = readState.fieldInfos.fieldInfo(field);
VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction();
HitQueue topK = new HitQueue(k, false);

int numVisited = 0;
TotalHits.Relation relation = TotalHits.Relation.EQUAL_TO;

int doc;
while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (acceptDocs != null && acceptDocs.get(doc) == false) {
continue;
}

if (numVisited >= visitedLimit) {
relation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
break;
}

float[] vector = values.vectorValue();
float score = vectorSimilarity.convertToScore(vectorSimilarity.compare(vector, target));
topK.insertWithOverflow(new ScoreDoc(doc, score));
numVisited++;
}
ScoreDoc[] topScoreDocs = new ScoreDoc[topK.size()];
for (int i = topScoreDocs.length - 1; i >= 0; i--) {
topScoreDocs[i] = topK.pop();
}
return new TopDocs(new TotalHits(values.size(), TotalHits.Relation.EQUAL_TO), topScoreDocs);
return new TopDocs(new TotalHits(numVisited, relation), topScoreDocs);
}

@Override
Expand Down
Expand Up @@ -69,13 +69,13 @@
* <li><b>[int]</b> the number of documents having values for this field
* <li><b>[int8]</b> if equals to -1, dense – all documents have values for a field. If equals to
* 0, sparse – some documents missing values.
* <li><b>array[vint]</b> for sparse case, the docids of documents having vectors, in order
* <li><b>array[int]</b> for sparse case, the docids of documents having vectors, in order
<li><b>[int]</b> the maximum number of connections (neighbours) that each node can have
* <li><b>[int]</b> number of levels in the graph
* <li>Graph nodes by level. For each level
* <ul>
* <li><b>[int]</b> the number of nodes on this level
* <li><b>array[vint]</b> for levels greater than 0 list of nodes on this level, stored as
* <li><b>array[int]</b> for levels greater than 0 list of nodes on this level, stored as
the level 0 node ordinals.
* </ul>
* </ul>
Expand Down
Expand Up @@ -347,7 +347,7 @@ private static class FieldEntry {
// as not all docs have vector values, fill a mapping from dense vector ordinals to docIds
ordToDoc = new int[size];
for (int i = 0; i < size; i++) {
int doc = input.readVInt();
int doc = input.readInt();
ordToDoc[i] = doc;
}
}
Expand All @@ -366,7 +366,7 @@ private static class FieldEntry {
} else {
nodesByLevel[level] = new int[numNodesOnLevel];
for (int i = 0; i < numNodesOnLevel; i++) {
nodesByLevel[level][i] = input.readVInt();
nodesByLevel[level][i] = input.readInt();
}
}
}
Expand Down
Expand Up @@ -213,7 +213,7 @@ private void writeMeta(
meta.writeByte((byte) 0); // sparse marker, some documents don't have vector values
DocIdSetIterator iter = docsWithField.iterator();
for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
meta.writeVInt(doc);
meta.writeInt(doc);
}
}

Expand All @@ -229,7 +229,7 @@ private void writeMeta(
if (level > 0) {
while (nodesOnLevel.hasNext()) {
int node = nodesOnLevel.nextInt();
meta.writeVInt(node); // list of nodes on a level
meta.writeInt(node); // list of nodes on a level
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions lucene/core/src/java/org/apache/lucene/geo/Tessellator.java
Expand Up @@ -102,6 +102,9 @@ public static List<Triangle> tessellate(final Polygon polygon, boolean checkSelf
if (outerNode == null) {
throw new IllegalArgumentException("Malformed shape detected in Tessellator!");
}
if (outerNode == outerNode.next || outerNode == outerNode.next.next) {
throw new IllegalArgumentException("at least three non-collinear points required");
}

// Determine if the specified list of points contains holes
if (polygon.numHoles() > 0) {
Expand Down Expand Up @@ -154,6 +157,9 @@ public static List<Triangle> tessellate(final XYPolygon polygon, boolean checkSe
if (outerNode == null) {
throw new IllegalArgumentException("Malformed shape detected in Tessellator!");
}
if (outerNode == outerNode.next || outerNode == outerNode.next.next) {
throw new IllegalArgumentException("at least three non-collinear points required");
}

// Determine if the specified list of points contains holes
if (polygon.numHoles() > 0) {
Expand Down
Expand Up @@ -69,17 +69,18 @@ public void visit(QueryVisitor visitor) {

@Override
public Query rewrite(IndexReader reader) throws IOException {
int rewritableReaders = 0;
boolean allReadersRewritable = true;
for (LeafReaderContext context : reader.leaves()) {
LeafReader leaf = context.reader();
Terms terms = leaf.terms(field);
PointValues pointValues = leaf.getPointValues(field);
if ((terms != null && terms.getDocCount() == leaf.maxDoc())
|| (pointValues != null && pointValues.getDocCount() == leaf.maxDoc())) {
rewritableReaders++;
if ((terms == null || terms.getDocCount() != leaf.maxDoc())
&& (pointValues == null || pointValues.getDocCount() != leaf.maxDoc())) {
allReadersRewritable = false;
break;
}
}
if (rewritableReaders == reader.leaves().size()) {
if (allReadersRewritable) {
return new MatchAllDocsQuery();
}
return super.rewrite(reader);
Expand Down

0 comments on commit 5f640d3

Please sign in to comment.