Skip to content

Commit

Permalink
avoid usage of existsByQuery. If a document can be loaded by the ID
Browse files Browse the repository at this point in the history
before testing other fields from the existsByQuery request, then a
document cache fills and queries after that one can be avoided.
  • Loading branch information
Orbiter committed Dec 12, 2013
1 parent 67e7dc0 commit 303f569
Show file tree
Hide file tree
Showing 8 changed files with 13 additions and 72 deletions.
17 changes: 7 additions & 10 deletions source/net/yacy/cora/federate/solr/SchemaConfiguration.java
Expand Up @@ -31,7 +31,6 @@
import java.util.Set;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;

Expand Down Expand Up @@ -158,17 +157,15 @@ public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueU
continue uniquecheck;
}
try {
if (segment.fulltext().getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) {
final SolrDocument doc = segment.fulltext().getDefaultConnector().getDocumentById(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"");
if (doc != null) {
// switch unique attribute in new document
sid.setField(uniquefield.getSolrFieldName(), false);
// switch attribute also in all existing documents (which should be exactly only one!)
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
for (SolrDocument doc: docs) {
SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
sidContext.setField(uniquefield.getSolrFieldName(), false);
segment.putDocumentInQueue(sidContext);
changed = true;
}
// switch attribute in existing document
SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
sidContext.setField(uniquefield.getSolrFieldName(), false);
segment.putDocumentInQueue(sidContext);
changed = true;
} else {
sid.setField(uniquefield.getSolrFieldName(), true);
}
Expand Down
Expand Up @@ -71,16 +71,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
protected final static int pagesize = 100;

/**
 * Tests whether at least one document matches the given query.
 * The answer is derived from {@code getCountByQuery(query)}; any failure
 * raised while counting is reported as "no match" instead of being propagated.
 *
 * @param query the query string handed to {@code getCountByQuery}
 * @return true if the count is greater than zero; false otherwise, also if counting fails
 * @throws IOException declared by the signature; this implementation catches every Throwable itself
 */
@Override
public boolean existsByQuery(final String query) throws IOException {
    boolean hasMatch;
    try {
        hasMatch = getCountByQuery(query) > 0;
    } catch (final Throwable ignored) {
        // best effort: a failing count is answered with "does not exist"
        hasMatch = false;
    }
    return hasMatch;
}

/**
* Get a query result from solr as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
Expand Down
Expand Up @@ -122,34 +122,6 @@ public void deleteByQuery(final String querystring) throws IOException {
this.clearCaches();
this.solr.deleteByQuery(querystring);
}

/**
 * Answers an existence query from the local caches where possible and asks
 * the wrapped connector only if no cache knows the query.
 * <p>
 * The caches are probed in the order hit cache, document cache, miss cache.
 * Each probe updates the matching hit/miss counter. The answer of the wrapped
 * connector is remembered in the hit cache (positive) or miss cache (negative).
 *
 * @param query the query string, used verbatim as the key for all cache probes
 * @return true if the hit cache or document cache contains the key or the wrapped connector
 *         reports a match; false if the miss cache contains the key, no connector is set,
 *         or the connector reports no match
 * @throws IOException if the wrapped connector throws while evaluating the query
 */
@Override
public boolean existsByQuery(final String query) throws IOException {
    // level 1: queries already known to have a match
    final boolean inHitCache = this.hitCache.containsKey(query);
    if (inHitCache) {
        this.hitCache_Hit++;
        return true;
    }
    this.hitCache_Miss++;

    // level 2: the document cache is probed with the same key
    // NOTE(review): the key is a query string here; confirm documentCache is not keyed by document id only
    final boolean inDocumentCache = this.documentCache.containsKey(query);
    if (inDocumentCache) {
        this.documentCache_Hit++;
        return true;
    }
    this.documentCache_Miss++;

    // level 3: queries already known to have no match
    final boolean inMissCache = this.missCache.containsKey(query);
    if (inMissCache) {
        this.missCache_Hit++;
        return false;
    }
    this.missCache_Miss++;

    // no cache knows the query: ask the wrapped connector, if there is one
    final boolean foundByConnector = this.solr != null && this.solr.existsByQuery(query);
    if (!foundByConnector) {
        // remember the negative answer
        this.missCache.put(query, EXIST);
        this.missCache_Insert++;
        return false;
    }

    // remember the positive answer and drop any stale negative entry
    this.missCache.remove(query);
    this.hitCache.put(query, EXIST);
    this.hitCache_Insert++;
    return true;
}

@Override
public SolrDocument getDocumentById(final String id, final String ... fields) throws IOException {
Expand Down
Expand Up @@ -376,12 +376,6 @@ public Set<String> existsByIds(Set<String> ids) throws IOException {
e.addAll(e1);
return e;
}

/**
 * Delegates the existence check to the wrapped connector.
 *
 * @param solrquery the query string, passed through unchanged
 * @return the answer of the wrapped connector
 * @throws IOException if the wrapped connector throws
 */
@Override
public boolean existsByQuery(String solrquery) throws IOException {
    // Known limitation: this answer is not strictly correct, because documents that are
    // still waiting in the queues are not considered. Getting it right would require
    // waiting until all queues are flushed, which may take very long when the queues
    // are refilled all the time.
    final boolean found = this.connector.existsByQuery(solrquery);
    return found;
}

@Override
public void add(SolrInputDocument solrdoc) throws IOException, SolrException {
Expand Down
Expand Up @@ -158,14 +158,6 @@ public void deleteByQuery(final String querystring) throws IOException {
if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
}

/**
 * Checks both mirrored connectors for a match of the given query.
 * solr0 is asked first; solr1 is only asked if solr0 is absent or reports no match.
 *
 * @param query the query string handed to the connectors
 * @return true if at least one available connector reports a match
 * @throws IOException if a queried connector throws
 */
@Override
public boolean existsByQuery(final String query) throws IOException {
    if (this.solr0 != null && this.solr0.existsByQuery(query)) {
        return true;
    }
    return this.solr1 != null && this.solr1.existsByQuery(query);
}

@Override
public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException {
Expand Down
Expand Up @@ -112,14 +112,6 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @throws IOException
*/
public Set<String> existsByIds(Set<String> ids) throws IOException;

/**
 * Check if at least one document in solr matches the given query.
 * In contrast to existsByIds this check takes a query string, not document ids.
 * @param solrquery the solr query string to test
 * @return true if any entry in solr exists that matches the query
 * @throws IOException may be thrown by implementations; the exact conditions are implementation-specific
 */
public boolean existsByQuery(final String solrquery) throws IOException;

/**
* add a solr input document
Expand Down
2 changes: 1 addition & 1 deletion source/net/yacy/crawler/CrawlStacker.java
Expand Up @@ -393,7 +393,7 @@ public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile p
final String urlstring = url.toString();
// check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); // TODO: combine the exists-query with this one
if (oldDate == null) {
if (dbocc != null) {
// do double-check
Expand Down
6 changes: 5 additions & 1 deletion source/net/yacy/search/index/ErrorCache.java
Expand Up @@ -160,7 +160,11 @@ public CollectionConfiguration.FailDoc get(final String urlhash) {

/**
 * Check whether an error entry is stored for the given url hash.
 * The document is loaded by its id (restricted to the fail reason field),
 * so no query has to be evaluated.
 *
 * @param urlHash the url hash; its ASCII string form is used as the document id
 * @return true if a document with this id exists and carries a non-empty
 *         fail reason; false if there is no such document, the fail reason
 *         is missing or empty, or loading the document fails with an IOException
 */
public boolean exists(final byte[] urlHash) {
    try {
        final String failreasonField = CollectionSchema.failreason_s.getSolrFieldName();
        // load the document by id and request only the fail reason field
        final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), failreasonField);
        if (doc == null) return false;
        // an error entry exists only if the document contains a value in the fail reason field
        final Object failreason = doc.getFieldValue(failreasonField);
        return failreason != null && failreason.toString().length() > 0;
    } catch (IOException e) {
        // best effort: a failing lookup is answered with "does not exist"
        return false;
    }
Expand Down

0 comments on commit 303f569

Please sign in to comment.