Skip to content

Commit

Permalink
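*) Better handling of robots.txt files with incorrectly capitalized keywords: "User-agent:" and "Disallow:" are now matched case-insensitively, and trailing "#" comments are stripped from Disallow lines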
Browse files Browse the repository at this point in the history
  • Loading branch information
theli committed Nov 6, 2005
1 parent a1406f4 commit f9fb284
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions source/de/anomic/data/robotsParser.java
Expand Up @@ -98,16 +98,17 @@ public static ArrayList parse(BufferedReader reader) throws IOException{
ArrayList deny = new ArrayList();

int pos;
String line = null;
boolean rule4Yacy = false;
String line = null, lineUpper = null;
boolean rule4Yacy = false;
while ((line = reader.readLine()) != null) {
line = line.trim();
lineUpper = line.toUpperCase();
if (line.length() == 0) {
// we have reached the end of the rule block
rule4Yacy = false;
} else if (line.startsWith("#")) {
// we can ignore this. Just a comment line
} else if ((!rule4Yacy) && (line.startsWith("User-agent:"))) {
} else if ((!rule4Yacy) && (lineUpper.startsWith("User-agent:".toUpperCase()))) {
// cutting off comments at the line end
pos = line.indexOf("#");
if (pos != -1) {
Expand All @@ -120,7 +121,13 @@ public static ArrayList parse(BufferedReader reader) throws IOException{
String userAgent = line.substring(pos).trim();
rule4Yacy = (userAgent.equals("*") || (userAgent.toLowerCase().indexOf("yacy") >=0));
}
} else if (line.startsWith("Disallow:") && rule4Yacy) {
} else if (lineUpper.startsWith("Disallow:".toUpperCase()) && rule4Yacy) {
// cutting off comments at the line end
pos = line.indexOf("#");
if (pos != -1) {
line = line.substring(0,pos);
}

pos = line.indexOf(" ");
if (pos != -1) {
// getting the path
Expand Down

0 comments on commit f9fb284

Please sign in to comment.