Avoid usage of existsByQuery. If a document can be loaded by its ID

before the other fields from the existsByQuery request are tested, the document cache is filled and subsequent queries for that document can be avoided.
yacy · Dec 12, 2013 · 303f569 · 303f569
1 parent 67e7dc0
commit 303f569
Show file tree

Hide file tree

Showing 8 changed files with 13 additions and 72 deletions.
diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@@ -31,7 +31,6 @@
 import java.util.Set;
 
 import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
 
@@ -158,17 +157,15 @@ public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueU
                         continue uniquecheck;
                     }
                     try {
-                        if (segment.fulltext().getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) {
+                        final java.util.List<SolrDocument> hits = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", 0, 1);
+                        if (hits != null && !hits.isEmpty()) {
                             // switch unique attribute in new document
                             sid.setField(uniquefield.getSolrFieldName(), false);
-                            // switch attribute also in all existing documents (which should be exactly only one!)
-                            SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
-                            for (SolrDocument doc: docs) {
-                                SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
-                                sidContext.setField(uniquefield.getSolrFieldName(), false);
-                                segment.putDocumentInQueue(sidContext);
-                                changed = true;
-                            }
+                            // switch attribute in the existing document; it is looked up by query because getDocumentById accepts only a document id, not a host/signature query
+                            SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(hits.get(0));
+                            sidContext.setField(uniquefield.getSolrFieldName(), false);
+                            segment.putDocumentInQueue(sidContext);
+                            changed = true;
                         } else {
                             sid.setField(uniquefield.getSolrFieldName(), true);
                         }

diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
@@ -71,16 +71,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
     }
     protected final static int pagesize = 100;
 
-    @Override
-    public boolean existsByQuery(final String query) throws IOException {
-        try {
-            long count = getCountByQuery(query);
-            return count > 0;
-        } catch (final Throwable e) {
-            return false;
-        }
-    }
-
     /**
      * Get a query result from solr as a stream of documents.
      * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.

diff --git a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java
@@ -122,34 +122,6 @@ public void deleteByQuery(final String querystring) throws IOException {
         this.clearCaches();
         this.solr.deleteByQuery(querystring);
     }
-
-    @Override
-    public boolean existsByQuery(final String query) throws IOException {
-        if (this.hitCache.containsKey(query)) {
-            this.hitCache_Hit++;
-            return true;
-        }
-        this.hitCache_Miss++;
-        if (this.documentCache.containsKey(query)) {
-            this.documentCache_Hit++;
-            return true;
-        }
-        this.documentCache_Miss++;
-        if (this.missCache.containsKey(query)) {
-            this.missCache_Hit++;
-            return false;
-        }
-        this.missCache_Miss++;
-        if (solr != null && solr.existsByQuery(query)) {
-            this.missCache.remove(query);
-            this.hitCache.put(query, EXIST);
-            this.hitCache_Insert++;
-            return true;
-        }
-        this.missCache.put(query, EXIST);
-        this.missCache_Insert++;
-        return false;
-    }
 
     @Override
     public SolrDocument getDocumentById(final String id, final String ... fields) throws IOException {

diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java
@@ -376,12 +376,6 @@ public Set<String> existsByIds(Set<String> ids) throws IOException {
         e.addAll(e1);
         return e;
     }
-
-    @Override
-    public boolean existsByQuery(String solrquery) throws IOException {
-        // this is actually wrong but to make it right we need to wait until all queues are flushed. But that may take very long when the queues are filled again all the time.
-        return this.connector.existsByQuery(solrquery);
-    }
 
     @Override
     public void add(SolrInputDocument solrdoc) throws IOException, SolrException {

diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java
@@ -158,14 +158,6 @@ public void deleteByQuery(final String querystring) throws IOException {
         if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
         if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
     }
-
-    @Override
-    public boolean existsByQuery(final String query) throws IOException {
-        if ((solr0 != null && solr0.existsByQuery(query)) || (solr1 != null && solr1.existsByQuery(query))) {
-            return true;
-        }
-        return false;
-    }
 
     @Override
     public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException {

diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
@@ -112,14 +112,6 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
      * @throws IOException
      */
     public Set<String> existsByIds(Set<String> ids) throws IOException;
-
-    /**
-     * check if a given document exists in solr
-     * @param solrquery
-     * @return true if any entry in solr exists
-     * @throws IOException
-     */
-    public boolean existsByQuery(final String solrquery) throws IOException;
 
     /**
      * add a solr input document

diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
@@ -393,7 +393,7 @@ public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile p
         final String urlstring = url.toString();
         // check if the url is double registered
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
-        final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
+        final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); // TODO: combine the exists-query with this one
         if (oldDate == null) {
             if (dbocc != null) {
                 // do double-check

diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java
@@ -160,7 +160,11 @@ public CollectionConfiguration.FailDoc get(final String urlhash) {
 
     public boolean exists(final byte[] urlHash) {
         try {
-            return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+            final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName());
+            if (doc == null) return false;
+            // an error entry exists only if the document carries a non-empty value in CollectionSchema.failreason_s (same meaning as the former failreason_s:[* TO *] query)
+            Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
+            return failreason != null && failreason.toString().length() > 0;
         } catch (IOException e) {
             return false;
         }