Permalink
Browse files

Improved search navigators counters accuracy and consistency.

- added some missing increments from RWI results
- decrement relevant navigator counts when solr or RWI results are
evicted because of duplicate detection or belatedly checked constraints
- do not compute facets when unnecessary to avoid unwanted CPU load
- do not increment from facets when already done
- do not rely on facets on remote solr peers requests, as most of the
time only a limited part of their total results is fetched (thus also
preventing unnecessary load on remote peers)
- use a concurrency friendly score map for the dates navigators to
prevent unwanted ConcurrentModificationExceptions

This improves the situation for the most obvious inconsistencies in
search navigator counts, but more has to be done for true accuracy
(notably when query modifier constraints are applied belatedly - after
the solr or RWI retrieval request - such as the content domain
constraint)
  • Loading branch information...
luccioman committed Sep 6, 2017
1 parent ba0ba75 commit 5d3ceb31b79418b7133fc5bc570781b259a67944
@@ -246,7 +246,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.put("nav-dates", 0);
} else {
prop.put("nav-dates", 1);
navigatorIterator = theSearch.dateNavigator.iterator(); // this iterator is different as it iterates by the key order (which is a date order)
navigatorIterator = theSearch.dateNavigator.keysByNaturalOrder(true); // this iterator is different as it iterates by the key order (which is a date order)
int i = 0, pos = 0, neg = 0;
long dx = -1;
Date fromconstraint = theSearch.getQuery().modifier.from == null ? null : DateDetection.parseLine(theSearch.getQuery().modifier.from, theSearch.getQuery().timezoneOffset);
@@ -257,6 +257,9 @@ public static serverObjects respond(final RequestHeader header, final serverObje
name = navigatorIterator.next().trim();
if (name.length() < 10) continue;
count = theSearch.dateNavigator.get(name);
if(count == 0) {
continue;
}
String shortname = name.substring(0, 10);
long d = Instant.parse(name).toEpochMilli();
Date dd = new Date(d);
@@ -29,19 +29,21 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import javax.servlet.http.HttpServletResponse;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
/**
* Base implementation class for Federated Search Connectors providing the basic
@@ -115,8 +117,7 @@ public void run() {
if (doclist != null) {
ConcurrentLog.info("YACY SEARCH (federated)", "Got " + doclist.size() + " documents from " + instancename);
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null
theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size());
theSearch.addNodes(doclist, null, snippets, false, instancename, doclist.size(), true);
for (URIMetadataNode doc : doclist) {
theSearch.addHeuristic(doc.hash(), instancename, false);
@@ -25,12 +25,14 @@
package net.yacy.cora.sorting;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
@@ -246,6 +248,24 @@ public String toString() {
for (int i = l.size() - 1; i >= 0; i--) r.add(l.get(i));
return r.iterator();
}
/**
 * Builds a sorted snapshot of the current keys and returns an iterator over it.
 * The keys are copied into a new TreeSet, so the returned iterator walks that copy
 * and not the underlying map.
 * @param up true to iterate in ascending natural order, false for the reversed order
 * @return an iterator over the copied keys, in the requested order
 */
public Iterator<E> keysByNaturalOrder(final boolean up) {
    // pick the ordering first, then fill the set in one bulk call
    final TreeSet<E> ordered = up
            ? new TreeSet<E>()
            : new TreeSet<E>(Collections.<E>reverseOrder());
    ordered.addAll(this.map.keySet());
    return ordered.iterator();
}
public static void main(final String[] args) {
final ConcurrentScoreMap<String> a = new ConcurrentScoreMap<String>();
@@ -154,7 +154,7 @@ public WordReferenceVars(final WordReference e, boolean local) {
/**
* initializer for special poison object
*/
public WordReferenceVars() {
private WordReferenceVars() {
this.flags = null;
this.lastModified = 0;
this.language = null;
@@ -210,6 +210,13 @@ public Bitfield flags() {
public byte[] getLanguage() {
    // hand the stored language code to the ASCII helper, which produces the byte form
    final String code = this.language;
    return ASCII.getBytes(code);
}
/**
 * Gives access to the language of this reference in its String form.
 * @return the ISO 639 language code of the reference, exactly as held in the
 *         language field; may be null (the no-argument constructor assigns null)
 */
public String getLanguageString() {
    return language;
}
@Override
public char getType() {
@@ -257,11 +257,14 @@ public static final String country(String code) {
}
/**
 * Checks whether the given alpha-2 code is supported.
 * The lookup is done on the lower-cased code (using Locale.ROOT), so the check
 * does not depend on the case of the argument.
 * NOTE(review): earlier wording called this a country code while citing ISO 639-1,
 * which standardizes language codes - confirm which kind of code is meant.
 * @param code the alpha-2 mnemonic to look up; may be null
 * @return true if the code is not null and is a key of the mapping, false otherwise
 */
public static final boolean exists(String code) {
    return code != null && mapping.containsKey(code.toLowerCase(Locale.ROOT));
}
@@ -746,8 +746,7 @@ private static void remoteSearchProcess(
} else {
// feed results as nodes (SolrQuery results) which carry metadata,
// to prevent a call to getMetaData for RWI results, which would fail (if no metadata in index and no display of these results)
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>();
event.addNodes(storeDocs, facets, snip, false, target.getName() + "/" + target.hash, count);
event.addNodes(storeDocs, null, snip, false, target.getName() + "/" + target.hash, count, true);
}
event.addFinalize();
event.addExpectedRemoteReferences(-count);
@@ -1110,6 +1109,8 @@ protected synchronized void close() {
* @param target target peer to query. May be null : in that case, local peer is queried.
* @param partitions
* @param blacklist url list to exclude from results
* @param useSolrFacets when true, use Solr computed facets when possible to update the event navigators counters
* @param incrementNavigators when true, increment event navigators either with facet counts or with individual results
* @return the size of results list
* @throws InterruptedException when interrupt status on calling thread is detected while processing
*/
@@ -1120,7 +1121,9 @@ protected static int solrQuery(
final int count,
final Seed target,
final int partitions,
final Blacklist blacklist) throws InterruptedException {
final Blacklist blacklist,
final boolean useSolrFacets,
final boolean incrementNavigators) throws InterruptedException {
//try {System.out.println("*** debug-query *** " + URLDecoder.decode(solrQuery.toString(), "UTF-8"));} catch (UnsupportedEncodingException e) {}
@@ -1205,18 +1208,28 @@ protected static int solrQuery(
}
// evaluate facets
for (String field: event.query.facetfields) {
FacetField facet = rsp[0].getFacetField(field);
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet == null ? null : facet.getValues();
if (values == null) continue;
for (Count ff: values) {
int c = (int) ff.getCount();
if (c == 0) continue;
if (ff.getName().length() == 0) continue; // facet entry without text is not useful
result.set(ff.getName(), c);
}
if (result.size() > 0) facets.put(field, result);
if(useSolrFacets) {
for (String field: event.query.facetfields) {
FacetField facet = rsp[0].getFacetField(field);
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet == null ? null : facet.getValues();
if (values == null) {
continue;
}
for (Count ff: values) {
int c = (int) ff.getCount();
if (c == 0) {
continue;
}
if (ff.getName().length() == 0) {
continue; // facet entry without text is not useful
}
result.set(ff.getName(), c);
}
if (result.size() > 0) {
facets.put(field, result);
}
}
}
// evaluate snippets
@@ -1331,7 +1344,7 @@ protected static int solrQuery(
docList[0].clear();
docList[0] = null;
if (localsearch) {
event.addNodes(resultContainer, facets, snippets, true, "localpeer", numFound);
event.addNodes(resultContainer, facets, snippets, true, "localpeer", numFound, incrementNavigators);
event.addFinalize();
event.addExpectedRemoteReferences(-count);
Network.log.info("local search (solr): localpeer sent " + resultContainer.size() + "/" + numFound + " references");
@@ -1347,7 +1360,7 @@ protected static int solrQuery(
docs); // will clear docs on return
writeToLocalIndexThread.start();
}
event.addNodes(resultContainer, facets, snippets, false, target.getName() + "/" + target.hash, numFound);
event.addNodes(resultContainer, facets, snippets, false, target.getName() + "/" + target.hash, numFound, incrementNavigators);
event.addFinalize();
event.addExpectedRemoteReferences(-count);
Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (resultContainer.size()) + "/" + numFound + " references");
@@ -252,19 +252,26 @@ public static void primaryRemoteSearches(
log.info("preparing remote search: shortmem=" + (shortmem ? "true" : "false") + ", indexingQueueSize=" + indexingQueueSize +
", redundancy=" + redundancy + ", minage=" + minage + ", dhtPeers=" + dhtPeers.size() + ", robinsonpeers=" + robinsonPeers.size() + ", health: " + (healthMessage.length() > 0 ? healthMessage.substring(2) : "perfect"));
/* Computing Solr facets is not relevant for remote Solr results and adds unnecessary CPU load on remote peers :
* facets count the total number of matching results per facet field, but we only fetch here at most 'count' results. The remaining part
* is not to be retrieved from remote peers even if making a new request filtering on one of these fields,
* as there is no guarantee the same remote peers would be selected. What's more, remote results can contain many
* duplicates that would be filtered when adding them to the event node stack.
*/
final boolean useFacets = false;
// start solr searches
final int targets = dhtPeers.size() + robinsonPeers.size();
if (!sb.getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) {
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0, event.excludeintext_image);
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, useFacets, event.excludeintext_image);
for (Seed s: robinsonPeers) {
if (MemoryControl.shortStatus()
|| Memory.load() > sb.getConfigFloat(SwitchboardConstants.REMOTESEARCH_MAXLOAD_SOLR,
SwitchboardConstants.REMOTESEARCH_MAXLOAD_SOLR_DEFAULT)) {
continue;
}
Thread t = solrRemoteSearch(event, solrQuery, start, count, s, targets, blacklist);
Thread t = solrRemoteSearch(event, solrQuery, start, count, s, targets, blacklist, useFacets, true);
event.nodeSearchThreads.add(t);
}
}
@@ -365,6 +372,8 @@ public void run() {
* @param targetPeer the target of the Solr query. When null, the query will run on this local peer.
* @param partitions the Solr query "partitions" parameter. Ignored when set to zero.
* @param blacklist the blacklist to use. Can be empty but must not be null.
* @param useSolrFacets when true, use Solr computed facets when possible to update the event navigators counters
* @param incrementNavigators when true, increment event navigators either with facet counts or with individual results
* @return the created and running Thread instance
*/
public static Thread solrRemoteSearch(
@@ -374,7 +383,9 @@ public static Thread solrRemoteSearch(
final int count,
final Seed targetPeer,
final int partitions,
final Blacklist blacklist) {
final Blacklist blacklist,
final boolean useSolrFacets,
final boolean incrementNavigators) {
//System.out.println("*** debug-remoteSearch ***:" + ConcurrentLog.stackTrace());
@@ -396,7 +407,9 @@ public void run() {
count,
targetPeer == null ? event.peers.mySeed() : targetPeer,
partitions,
blacklist);
blacklist,
useSolrFacets,
incrementNavigators);
if (urls >= 0) {
// urls is an array of url hashes. this is only used for log output
event.peers.mySeed().incRI(urls);
Oops, something went wrong.

0 comments on commit 5d3ceb3

Please sign in to comment.