Skip to content

Commit

Permalink
*) Changing robots parser exclusion policy
Browse files Browse the repository at this point in the history
   - crawling is now allowed if the server returned a 403 status code 
     when trying to download the robots.txt
   See: http://www.yacy-forum.de/viewtopic.php?t=1612

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1421 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Jan 24, 2006
1 parent c69f7a3 commit 754a358
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions source/de/anomic/data/robotsParser.java
Expand Up @@ -293,7 +293,7 @@ private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, p
if (modDate != null) reqHeaders.put(httpHeader.IF_MODIFIED_SINCE,httpc.dateString(entry.getModDate()));
}

httpc.response res = con.GET(robotsURL.getPath(), reqHeaders);
httpc.response res = con.GET(robotsURL.getFile(), reqHeaders);
if (res.status.startsWith("2")) {
if (!res.responseHeader.mime().startsWith("text/plain")) {
robotsTxt = null;
Expand Down Expand Up @@ -336,7 +336,7 @@ private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, p
"\nRedirecting request to: " + redirectionUrl);
return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);

} else if (res.status.startsWith("401") || res.status.startsWith("403")) {
} else if (res.status.startsWith("401")/* || res.status.startsWith("403") */) {
accessCompletelyRestricted = true;
serverLog.logFinest("ROBOTS","Access to Robots.txt not allowed on URL '" + robotsURL + "'.");
} else {
Expand Down

0 comments on commit 754a358

Please sign in to comment.