Skip to content

Commit

Permalink
- better logging when rejecting a url because it is not in declared domain
Browse files Browse the repository at this point in the history

- more XSS attack protection

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4720 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Apr 20, 2008
1 parent 6d1be66 commit 5e3ce46
Show file tree
Hide file tree
Showing 16 changed files with 55 additions and 39 deletions.
2 changes: 1 addition & 1 deletion htroot/AccessTracker_p.java
Expand Up @@ -146,7 +146,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
if (page == 2) {
// local search
prop.putNum("page_list_" + entCount + "_offset", searchProfile.offset);
prop.put("page_list_" + entCount + "_querystring", searchProfile.queryString);
prop.putHTML("page_list_" + entCount + "_querystring", searchProfile.queryString);
} else {
// remote search
prop.putHTML("page_list_" + entCount + "_peername", (searchProfile.remotepeer == null) ? "<unknown>" : searchProfile.remotepeer.getName());
Expand Down
2 changes: 1 addition & 1 deletion htroot/Config_p.java
Expand Up @@ -83,7 +83,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
while(keys.hasNext()){
key = (String) keys.next();
prop.put("options_"+count+"_key", key);
prop.put("options_"+count+"_value", env.getConfig(key, "ERROR"));
prop.putHTML("options_"+count+"_value", env.getConfig(key, "ERROR"));
count++;
}

Expand Down
2 changes: 1 addition & 1 deletion htroot/Connections_p.java
Expand Up @@ -208,7 +208,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("list_" + idx + "_ms", "1");
prop.putNum("list_" + idx + "_ms_duration", sessionTime);
}
prop.put("list_" + idx + "_source",(seed!=null)?seed.getName()+".yacy":userAddress.getHostAddress()+":"+userPort);
prop.putHTML("list_" + idx + "_source",(seed!=null)?seed.getName()+".yacy":userAddress.getHostAddress()+":"+userPort);
prop.put("list_" + idx + "_dest",(dest==null)?"-":dest);
if (blockingRequest) {
prop.put("list_" + idx + "_running", "0");
Expand Down
10 changes: 5 additions & 5 deletions htroot/IndexCreateIndexingQueue_p.java
Expand Up @@ -138,11 +138,11 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
totalSize += entrySize;
initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
prop.put("indexing-queue_list_"+entryCount+"_dark", inProcess ? "2" : (dark ? "1" : "0"));
prop.put("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.putHTML("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+entryCount+"_depth", pcentry.depth());
prop.put("indexing-queue_list_"+entryCount+"_modified", pcentry.getModificationDate().toString());
prop.putHTML("indexing-queue_list_"+entryCount+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
prop.put("indexing-queue_list_"+entryCount+"_url", pcentry.url().toNormalform(false, true));
prop.putHTML("indexing-queue_list_"+entryCount+"_url", pcentry.url().toNormalform(false, true));
prop.put("indexing-queue_list_"+entryCount+"_size", serverMemory.bytesToString(entrySize));
prop.put("indexing-queue_list_"+entryCount+"_inProcess", inProcess ? "1" :"0");
prop.put("indexing-queue_list_"+entryCount+"_inProcess_hash", pcentry.urlHash());
Expand Down Expand Up @@ -185,9 +185,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
executorHash = entry.executor();
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
prop.put("rejected_list_"+j+"_url", url.toNormalform(false, true));
prop.putHTML("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
prop.putHTML("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
prop.putHTML("rejected_list_"+j+"_url", url.toNormalform(false, true));
prop.putHTML("rejected_list_"+j+"_failreason", entry.anycause());
prop.put("rejected_list_"+j+"_dark", dark ? "1" : "0");
dark = !dark;
Expand Down
4 changes: 2 additions & 2 deletions htroot/IndexCreateLoaderQueue_p.java
Expand Up @@ -72,10 +72,10 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve

initiator = yacyCore.seedDB.getConnected(w[i].initiator());
prop.put("loader-set_list_"+count+"_dark", dark ? "1" : "0");
prop.put("loader-set_list_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.putHTML("loader-set_list_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("loader-set_list_"+count+"_depth", w[i].depth());
prop.put("loader-set_list_"+count+"_status", w[i].getStatus());
prop.put("loader-set_list_"+count+"_url", w[i].url().toNormalform(true, false));
prop.putHTML("loader-set_list_"+count+"_url", w[i].url().toNormalform(true, false));
dark = !dark;
count++;
}
Expand Down
4 changes: 2 additions & 2 deletions htroot/IndexCreateWWWGlobalQueue_p.java
Expand Up @@ -119,12 +119,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : switchboard.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.put("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
dark = !dark;
showNum++;
Expand Down
4 changes: 2 additions & 2 deletions htroot/IndexCreateWWWLocalQueue_p.java
Expand Up @@ -184,12 +184,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.put("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
dark = !dark;
showNum++;
Expand Down
4 changes: 2 additions & 2 deletions htroot/IndexCreateWWWRemoteQueue_p.java
Expand Up @@ -119,12 +119,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
prop.put("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.loaddate()) );
prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
prop.put("crawler-queue_list_" + showNum + "_url", urle.url().toString());
prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
dark = !dark;
showNum++;
Expand Down
5 changes: 3 additions & 2 deletions htroot/rct_p.java
Expand Up @@ -73,7 +73,8 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
loaddate = new Date();
}
yacyURL referrer = null; // referrer needed!
if (sb.acceptURL(url)) {
String urlRejectReason = sb.acceptURL(url);
if (urlRejectReason == null) {
// stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
Expand All @@ -88,7 +89,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
}
} else {
env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
env.getLog().logWarning("crawlOrder: Rejected URL '" + url.toNormalform(true, false) + "': " + urlRejectReason);
}
}
}
Expand Down
5 changes: 3 additions & 2 deletions htroot/yacy/crawlReceipt.java
Expand Up @@ -147,8 +147,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}

// check if the entry is in our network domain
if (!switchboard.acceptURL(comp.url())) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url outside of our domain) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
String urlRejectReason = switchboard.acceptURL(comp.url());
if (urlRejectReason != null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "9999");
return prop;
}
Expand Down
5 changes: 3 additions & 2 deletions htroot/yacy/transferURL.java
Expand Up @@ -145,8 +145,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}

// check if the entry is in our network domain
if (!sb.acceptURL(comp.url())) {
yacyCore.log.logFine("transferURL: blocked URL outside of our domain '" + comp.url().toNormalform(false, true) + "' from peer " + otherPeerName);
String urlRejectReason = sb.acceptURL(comp.url());
if (urlRejectReason != null) {
yacyCore.log.logFine("transferURL: blocked URL '" + comp.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName);
lEntry = null;
blocked++;
continue;
Expand Down
5 changes: 3 additions & 2 deletions source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
Expand Up @@ -321,7 +321,8 @@ public boolean remoteCrawlLoaderJob() {
} catch (ParseException e) {
loaddate = new Date();
}
if (sb.acceptURL(url)) {
String urlRejectReason = sb.acceptURL(url);
if (urlRejectReason == null) {
// stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);
Expand All @@ -336,7 +337,7 @@ public boolean remoteCrawlLoaderJob() {
log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
}
} else {
log.logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
log.logWarning("crawlOrder: Rejected URL '" + url.toNormalform(true, false) + "': " + urlRejectReason);
}
}
return true;
Expand Down
1 change: 0 additions & 1 deletion source/de/anomic/plasma/plasmaCrawlEURL.java
Expand Up @@ -36,7 +36,6 @@ public class plasmaCrawlEURL {
public static final String DENIED_URL_NULL = "denied_(url_null)";
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
public static final String DENIED_UNSUPPORTED_PROTOCOL = "denied_(unsupported_protocol)";
public static final String DENIED_IP_ADDRESS_NOT_IN_DECLARED_DOMAIN = "denied_(address_not_in_declared_domain)";
public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";
public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)";
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/plasma/plasmaCrawlStacker.java
Expand Up @@ -385,10 +385,10 @@ public String stackCrawl(plasmaCrawlEntry entry) {
}

// check if ip is local ip address
if (!sb.acceptURL(entry.url())) {
reason = plasmaCrawlEURL.DENIED_IP_ADDRESS_NOT_IN_DECLARED_DOMAIN + "[" + sb.getConfig("network.unit.domain", "unknown") + "]";
if (this.log.isFine()) this.log.logFine("Host in URL '" + entry.url().toString() + "' has IP address outside of declared range (" + sb.getConfig("network.unit.domain", "unknown") + "). " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
String urlRejectReason = sb.acceptURL(entry.url());
if (urlRejectReason != null) {
reason = "denied_(" + urlRejectReason + ")_domain=" + sb.getConfig("network.unit.domain", "unknown");
if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}

Expand Down
28 changes: 20 additions & 8 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -1427,21 +1427,32 @@ public boolean isInMyCluster(yacySeed seed) {
}
}

public boolean acceptURL(yacyURL url) {
/**
* Test whether a url can be used for crawling/indexing.
* This mainly checks if the url is in the declared domain (local/global)
* @param url
* @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
*/
public String acceptURL(yacyURL url) {
// returns true if the url can be accepted according to network.unit.domain
if (url == null) return false;
if (url == null) return "url is null";
String host = url.getHost();
if (host == null) return false;
if (this.acceptGlobalURLs && this.acceptLocalURLs) return true; // fast shortcut to avoid dnsResolve
if (host == null) return "url.host is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
InetAddress hostAddress = serverDomains.dnsResolve(host);
// if we don't know the host, we cannot load that resource anyway.
// But in case we use a proxy, it is possible that we don't have a DNS service.
final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig();
if (hostAddress == null) return ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy()));
if (hostAddress == null) {
if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved";
}
// check if this is a local address and we are allowed to index local pages:
boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
return (this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local);
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
return (local) ?
("the host '" + host + "' is local, but local addresses are not accepted") :
("the host '" + host + "' is global, but global addresses are not accepted");
}

public String urlExists(String hash) {
Expand Down Expand Up @@ -1631,8 +1642,9 @@ public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) {
*
* check if ip is local ip address // TODO: remove this protocol-specific code here
* ========================================================================= */
if (!acceptURL(entry.url())) {
if (this.log.isFine()) this.log.logFine("Host in URL '" + entry.url() + "' is not in defined indexing domain.");
String urlRejectReason = acceptURL(entry.url());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logFine("Rejected URL '" + entry.url() + "': " + urlRejectReason);
doIndexing = false;
}

Expand Down
5 changes: 3 additions & 2 deletions source/de/anomic/yacy/yacyClient.java
Expand Up @@ -536,8 +536,9 @@ public static String[] search(
continue; // block with blacklist
}

if (!plasmaSwitchboard.getSwitchboard().acceptURL(comp.url())) {
yacyCore.log.logInfo("remote search (client): rejected url outside of our domain " + comp.url() + " from peer " + target.getName());
String urlRejectReason = plasmaSwitchboard.getSwitchboard().acceptURL(comp.url());
if (urlRejectReason != null) {
yacyCore.log.logInfo("remote search (client): rejected url '" + comp.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
continue; // reject url outside of our domain
}

Expand Down

0 comments on commit 5e3ce46

Please sign in to comment.