Skip to content

Commit

Permalink
Robots : properly handle URLs including non ASCII characters
Browse files Browse the repository at this point in the history
This fixes GitHub issue 80 (
#80 ) reported by
Lord-Protector.
  • Loading branch information
luccioman committed Oct 12, 2016
1 parent 75bb77f commit 8b341e9
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions source/net/yacy/crawler/robots/RobotsTxtEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ByteArray;
import net.yacy.cora.util.ConcurrentLog;


public class RobotsTxtEntry {
Expand Down Expand Up @@ -221,9 +222,19 @@ public boolean isDisallowed(final MultiProtocolURL subpathURL) {
}

// if the path is null or empty we set it to /
if (path == null || path.isEmpty()) path = "/";
// escaping all occurrences of ; because this char is used as special char in the Robots DB
else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
if (path == null || path.isEmpty()) {
path = "/";
} else {
/* non-ASCII characters: apply the same decoding as the one used to create the denyPathList (see RobotsTxtParser.parse()) */
try {
path = UTF8.decodeURL(path);
} catch(Exception e) {
ConcurrentLog.warn(RobotsTxtEntry.class.getName(), "Could not decode path : " + path);

}
// escaping all occurrences of ; because this char is used as special char in the Robots DB
path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
}

for (final String element : this.denyPathList) {

Expand Down

0 comments on commit 8b341e9

Please sign in to comment.