Commit
Fixed display of crawler pending URL counts in the HostBrowser.html page.
As described in mantis 722 (http://mantis.tokeek.de/view.php?id=722)

Also updated some Javadoc.
luccioman committed Jan 22, 2017
1 parent 870a5ea commit 39e081e
Showing 4 changed files with 50 additions and 5 deletions.
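The bug in a nutshell: the crawler's local stack keys its per-domain counts by "hostname:port", while the HostBrowser.html page lists bare host names, so a lookup such as crawler.containsKey(host) silently missed and pending counts were not shown. The standalone sketch below illustrates the mismatch and the per-host aggregation the commit introduces; the stripToHostName helper here is only a stand-in for net.yacy.cora.protocol.Domains.stripToHostName, and its exact behavior (dropping a trailing ":port") is an assumption:

    import java.util.HashMap;
    import java.util.Map;

    public class PendingCountDemo {
        // stand-in for Domains.stripToHostName (assumed to drop a trailing ":port")
        static String stripToHostName(String key) {
            final int colon = key.indexOf(':');
            return colon < 0 ? key : key.substring(0, colon);
        }

        public static void main(String[] args) {
            // crawler stack entries as returned by getDomainStackHosts: {pending count, waiting time}
            final Map<String, Integer[]> crawler = new HashMap<>();
            crawler.put("example.com:80", new Integer[]{5, 0});
            crawler.put("example.com:8443", new Integer[]{3, 0});

            // before the fix: a lookup by bare host name finds nothing
            System.out.println(crawler.containsKey("example.com")); // false

            // after the fix: pending counts are summed per bare host name
            final Map<String, Integer> pending = new HashMap<>();
            for (Map.Entry<String, Integer[]> e : crawler.entrySet()) {
                final String hostName = stripToHostName(e.getKey());
                Integer count = pending.get(hostName);
                pending.put(hostName, (count == null ? 0 : count) + e.getValue()[0]);
            }
            System.out.println(pending.get("example.com")); // 8
        }
    }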
48 changes: 46 additions & 2 deletions htroot/HostBrowser.java
@@ -44,6 +44,7 @@
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
@@ -65,6 +66,9 @@
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

/**
* Browser for indexed resources
*/
public class HostBrowser {

final static long TIMEOUT = 10000L;
@@ -73,6 +77,32 @@ public static enum StoreType {
LINK, INDEX, EXCLUDED, FAILED, RELOAD;
}

/**
* <p>Retrieve local index entries for a path, or for the hosts with the most references. Also allows some maintenance operations on entries with load errors.</p>
* <p>Some parameters require administrator authentication, or an unauthenticated request from the local host, to be allowed: load, deleteLoadErrors, delete and reload404.
* The "load" parameter can also be applied without authentication when the "browser.load4everyone" configuration setting is true.</p>
* @param header servlet request header
* @param post request parameters. Supported keys:<ul>
* <li>admin : when "true", render the administration context (menu and top navbar) in the HTML page</li>
* <li>path : root URL or host name to browse (ignored when the hosts parameter is filled)</li>
* <li>load : URL to crawl and index. The path URL is crawled and indexed when this parameter is present but empty.</li>
* <li>deleteLoadErrors : delete from the local index the documents with a load error (HTTP status different from 200, or any other failure).</li>
* <li>hosts : generate the list of hosts with the most references. Supported values:
* <ul>
* <li>"crawling" : restrict to hosts currently being crawled</li>
* <li>"error" : restrict to hosts having at least one resource load error</li>
* </ul>
* </li>
* <li>delete : delete from the index the whole document tree matching the path prefix</li>
* <li>reload404 : reload documents matching the path prefix which previously failed to load due to a network error</li>
* <li>facetcount : </li>
* <li>complete : we want only root paths for complete lists</li>
* <li>nepr : </li>
* <li>showlinkstructure : </li>
* </ul>
* @param env server environment
* @return the servlet answer object
*/
@SuppressWarnings({ "unchecked" })
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
@@ -209,6 +239,18 @@ url, null, load, new Date(),

// collect hosts from crawler
final Map<String, Integer[]> crawler = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();

final Map<String, Integer> hostNameToPendingCount = new HashMap<>();
for (Entry<String, Integer[]> crawlerEntry : crawler.entrySet()) {
    /* The local stack returns keys composed of "hostname:port": we now sum pending URL counts by host name */
    String hostName = Domains.stripToHostName(crawlerEntry.getKey());
    Integer pendingCount = hostNameToPendingCount.get(hostName);
    if (pendingCount == null) {
        pendingCount = 0;
    }
    pendingCount += crawlerEntry.getValue()[0];
    hostNameToPendingCount.put(hostName, pendingCount);
}

// collect the errorurls
Map<String, ReversibleScoreMap<String>> exclfacets = authorized ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null;
@@ -223,13 +265,15 @@ url, null, load, new Date(),
host = i.next();
prop.put("hosts_list_" + c + "_admin", admin ? "true" : "false");
prop.putHTML("hosts_list_" + c + "_host", host);
boolean inCrawler = crawler.containsKey(host);
boolean inCrawler = hostNameToPendingCount.containsKey(host);
int exclcount = exclscore.get(host);
int failcount = failscore.get(host);
int errors = exclcount + failcount;
prop.put("hosts_list_" + c + "_count", hostscore.get(host));
prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
if (inCrawler) {
prop.put("hosts_list_" + c + "_crawler_pending", hostNameToPendingCount.get(host));
}
prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0);
if (errors > 0) {
prop.put("hosts_list_" + c + "_errors_exclcount", exclcount);
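As an aside, not part of the commit: on Java 8+ the accumulation loop above could be written more compactly with Map.merge. A sketch under the same types and imports as the surrounding file:

    final Map<String, Integer> hostNameToPendingCount = new HashMap<>();
    for (Entry<String, Integer[]> crawlerEntry : crawler.entrySet()) {
        /* sum pending URL counts per bare host name, stripping any ":port" suffix */
        hostNameToPendingCount.merge(Domains.stripToHostName(crawlerEntry.getKey()),
                crawlerEntry.getValue()[0], Integer::sum);
    }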
3 changes: 2 additions & 1 deletion source/net/yacy/crawler/Balancer.java
@@ -110,7 +110,8 @@ public interface Balancer {

/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
* @return a map of clear text strings of host names (each host name possibly concatenated with a port, depending on the implementation)
* to an integer array: {the size of the domain stack, guessed delta waiting time}
*/
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots);

2 changes: 1 addition & 1 deletion source/net/yacy/crawler/HostBalancer.java
@@ -484,7 +484,7 @@ public void remove() {

/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
* @return a map of clear text strings of host names + ports to an integer array: {the size of the domain stack, guessed delta waiting time}
*/
@Override
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
2 changes: 1 addition & 1 deletion source/net/yacy/crawler/data/NoticedURL.java
@@ -249,7 +249,7 @@ public int removeByHostHash(final Set<String> hosthashes) {

/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time
* @return a map of clear text strings of host names (each host name possibly concatenated with a port, depending on the stack) to two integers: the size of the domain stacks and the access delta time
*/
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType, RobotsTxt robots) {
switch (stackType) {
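Given the clarified contract (keys may be bare host names or "hostname:port", depending on the stack), a caller should normalize keys before comparing them against host names. A hypothetical consumer, with the variable names noticeURL and robots assumed from context:

    Map<String, Integer[]> stacks = noticeURL.getDomainStackHosts(StackType.LOCAL, robots);
    for (Map.Entry<String, Integer[]> entry : stacks.entrySet()) {
        // the key may or may not carry a ":port" suffix; strip it before any host-name comparison
        String host = Domains.stripToHostName(entry.getKey());
        System.out.println(host + ": " + entry.getValue()[0] + " URLs pending, delta " + entry.getValue()[1]);
    }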
