Skip to content

Commit

Permalink
*) adding more urls to the error url
Browse files Browse the repository at this point in the history
   - old error strings were replaced with their corresponding constants
   See: http://www.yacy-forum.de/viewtopic.php?t=2638

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2360 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Aug 7, 2006
1 parent d56f064 commit 9f29808
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 47 deletions.
62 changes: 62 additions & 0 deletions source/de/anomic/plasma/plasmaCrawlEURL.java
Expand Up @@ -6,6 +6,10 @@
// Frankfurt, Germany, 2004
// last major change: 09.08.2004
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
Expand Down Expand Up @@ -59,6 +63,64 @@

public class plasmaCrawlEURL extends indexURL {

/* =======================================================================
* Failure reason constants
*
* String codes naming the reason a URL was rejected or failed.
* plasmaCrawlStacker.stackCrawl assigns several of these to its
* 'reason' variable and returns it to the caller.
*
* Most values are complete codes of the form "denied_(...)". Two of them
* (DENIED_WRONG_HTTP_STATUSCODE and DOUBLE_REGISTERED) are prefixes only
* and must be completed by the caller; see the notes on those constants.
* ======================================================================= */
// invalid urls
public static final String DENIED_URL_NULL = "denied_(url_null)";
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
public static final String DENIED_PRIVATE_IP_ADDRESS = "denied_(private_ip_address)";
public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";
public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)";

// blacklisted/blocked urls
public static final String DENIED_URL_IN_BLACKLIST = "denied_(url_in_blacklist)";
public static final String DENIED_URL_DOES_NOT_MATCH_FILTER = "denied_(does_not_match_filter)";
public static final String DENIED_CGI_URL = "denied_(cgi_url)";
public static final String DENIED_POST_URL = "denied_(post_url)";
public static final String DENIED_NO_MATCH_WITH_DOMAIN_FILTER = "denied_(no_match_with_domain_filter)";
public static final String DENIED_DOMAIN_COUNT_EXCEEDED = "denied_(domain_count_exceeded)";
public static final String DENIED_ROBOTS_TXT = "denied_(robots.txt)";

// wrong content
public static final String DENIED_WRONG_MIMETYPE_OR_EXT = "denied_(wrong_mimetype_or_extension)";
public static final String DENIED_REDIRECTION_HEADER_EMPTY = "denied_(redirection_header_empty)";
public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)";
// PREFIX ONLY: the literal ends with '_' and has no closing ')'.
// Presumably the caller appends the HTTP status code and ")" - TODO confirm at the call sites.
public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_";
public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)";

// network errors
public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)";
public static final String DENIED_NO_ROUTE_TO_HOST = "denied_(no_route_to_host)";
// NOTE(review): unlike most other codes, this literal contains an upper-case letter.
public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)";

// connection errors
public static final String DENIED_CONNECTION_ERROR = "denied_(socket_connection_error)";
public static final String DENIED_CONNECTION_BIND_EXCEPTION = "denied_(connection_bind_exception)";
public static final String DENIED_CONNECTION_TIMEOUT = "denied_(connection_timeout)";
public static final String DENIED_CONNECTION_REFUSED = "denied_(connection_refused)";
// NOTE(review): this literal also contains an upper-case letter.
public static final String DENIED_SSL_UNTRUSTED_CERT = "denied_(No_trusted_ssl_certificate_found)";

// double registered errors
// PREFIX ONLY: the literal has no closing ')'. plasmaCrawlStacker.stackCrawl
// completes it as DOUBLE_REGISTERED + dbocc + ")".
public static final String DOUBLE_REGISTERED = "double_(registered_in_";

// server errors
public static final String DENIED_OUT_OF_DISK_SPACE = "denied_(out_of_disk_space)";
public static final String DENIED_SERVER_SHUTDOWN = "denied_(server_shutdown)";

// Parser errors
public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";
// NOTE(review): the literal spells "parseabel" (sic). It is left unchanged here because
// the value may be stored or compared elsewhere - confirm before correcting the spelling.
public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)";

// indexing errors
public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)";
public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)";


/* =======================================================================
* Other object variables
* ======================================================================= */
private LinkedList rejectedStack = new LinkedList(); // strings: url

public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime) {
Expand Down
26 changes: 13 additions & 13 deletions source/de/anomic/plasma/plasmaCrawlStacker.java
Expand Up @@ -231,7 +231,7 @@ public String stackCrawl(String nexturlString, String referrerString, String ini

// strange errors
if (nexturlString == null) {
reason = "denied_(url_null)";
reason = plasmaCrawlEURL.DENIED_URL_NULL;
this.log.logSevere("Wrong URL in stackCrawl: url=null");
return reason;
}
Expand All @@ -256,7 +256,7 @@ public String stackCrawl(String nexturlString, String referrerString, String ini
try {
nexturl = new URL(nexturlString);
} catch (MalformedURLException e) {
reason = "denied_(url_'" + nexturlString + "'_wrong)";
reason = plasmaCrawlEURL.DENIED_MALFORMED_URL;
this.log.logSevere("Wrong URL in stackCrawl: " + nexturlString +
". Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
Expand All @@ -265,33 +265,33 @@ public String stackCrawl(String nexturlString, String referrerString, String ini
// check if ip is local ip address
InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost());
if (hostAddress == null) {
reason = "denied_(unknown_host)";
reason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST;
this.log.logFine("Unknown host in URL '" + nexturlString + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
} else if (hostAddress.isSiteLocalAddress()) {
reason = "denied_(private_ip_address)";
reason = plasmaCrawlEURL.DENIED_PRIVATE_IP_ADDRESS;
this.log.logFine("Host in URL '" + nexturlString + "' has private IP address. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
} else if (hostAddress.isLoopbackAddress()) {
reason = "denied_(loopback_ip_address)";
reason = plasmaCrawlEURL.DENIED_LOOPBACK_IP_ADDRESS;
this.log.logFine("Host in URL '" + nexturlString + "' has loopback IP address. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}

// check blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) {
reason = "denied_(url_in_blacklist)";
reason = plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST;
this.log.logFine("URL '" + nexturlString + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}

// filter deny
if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) {
reason = "denied_(does_not_match_filter)";
reason = plasmaCrawlEURL.DENIED_URL_DOES_NOT_MATCH_FILTER;
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
Expand All @@ -302,7 +302,7 @@ public String stackCrawl(String nexturlString, String referrerString, String ini

// deny cgi
if (plasmaHTCache.isCGI(nexturlString)) {
reason = "denied_(cgi_url)";
reason = plasmaCrawlEURL.DENIED_CGI_URL;
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
Expand All @@ -313,7 +313,7 @@ public String stackCrawl(String nexturlString, String referrerString, String ini

// deny post properties
if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) {
reason = "denied_(post_url)";
reason = plasmaCrawlEURL.DENIED_POST_URL;
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
Expand All @@ -329,15 +329,15 @@ public String stackCrawl(String nexturlString, String referrerString, String ini

// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(nexturl.getHost()))) {
reason = "denied_(no_match_with_domain_filter)";
reason = plasmaCrawlEURL.DENIED_NO_MATCH_WITH_DOMAIN_FILTER;
this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}

// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(nexturl.getHost()))) {
reason = "denied_(domain_count_exceeded)";
reason = plasmaCrawlEURL.DENIED_DOMAIN_COUNT_EXCEEDED;
this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
Expand All @@ -352,7 +352,7 @@ public String stackCrawl(String nexturlString, String referrerString, String ini
boolean recrawl = (oldEntry != null) &&
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
reason = "double_(registered_in_" + dbocc + ")";
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
Expand All @@ -363,7 +363,7 @@ public String stackCrawl(String nexturlString, String referrerString, String ini

// checking robots.txt
if (robotsParser.isDisallowed(nexturl)) {
reason = "denied_(robots.txt)";
reason = plasmaCrawlEURL.DENIED_ROBOTS_TXT;
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
Expand Down

0 comments on commit 9f29808

Please sign in to comment.