Skip to content
Permalink
Browse files

Improved normalization of blacklist path patterns having non ascii chars

Normalize blacklist path patterns using percent-encoding, both when a pattern
is edited in the web interface and when it is loaded from configuration files.

Fixes issue #237
  • Loading branch information...
luccioman committed Oct 2, 2018
1 parent d42f079 commit ed93221fa1c179981727af793e01e74a4deeca33
@@ -37,6 +37,9 @@
import java.util.Map;
import java.util.Map.Entry;

import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.document.id.Punycode;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager;
@@ -313,7 +316,10 @@ public static serverObjects respond(final RequestHeader header, final serverObje
for (final Entry<String, String> selectedEntry : selectedBlacklistEntries.entrySet()) {

final String editedEntryValue = editedBlacklistEntries.get(selectedEntry.getKey().replace("selectedBlacklistEntry.", "editedBlacklistEntry."));
if (!selectedEntry.getValue().equals(editedEntryValue)) {

final String preparedNewEntry = prepareNormalizedEntry(editedEntryValue);

if (!normalizeEntry(selectedEntry.getValue()).equals(preparedNewEntry)) {

/* Add first, to detect any eventual syntax errors before removing the old entry */
if (!BlacklistHelper.addBlacklistEntry(blacklistToUse, editedEntryValue, header)) {
@@ -540,4 +546,33 @@ public static serverObjects respond(final RequestHeader header, final serverObje
return prop;
}

/**
 * Runs an entry through {@code BlacklistHelper.prepareEntry()} and then
 * through {@code normalizeEntry()}, mirroring what is done internally in
 * BlacklistHelper.addBlacklistEntry().
 *
 * @param entry a blacklist entry. Must not be null.
 * @return the prepared, then normalized entry
 */
private static String prepareNormalizedEntry(final String entry) {
    final String preparedEntry = BlacklistHelper.prepareEntry(entry);
    return normalizeEntry(preparedEntry);
}

/**
 * Normalizes a blacklist entry as done internally in
 * BlacklistHelper.addBlacklistEntry() : the host part (everything before the
 * first '/') is punycode encoded when it is not made of basic characters only,
 * and the path part (everything after the first '/') is percent-encoded as a
 * path pattern.
 *
 * @param entry a blacklist entry. Must not be null.
 * @return the normalized entry (punycode encoded host and percent-encoded
 *         path). When the entry contains no '/' at all, the whole entry is
 *         handled as a host and returned without any path part.
 */
private static String normalizeEntry(final String entry) {
    final int slashPos = entry.indexOf('/');
    /*
     * An entry without any '/' has no path part : taking substring(0, -1) on it
     * would throw a StringIndexOutOfBoundsException, so the whole entry is
     * handled as the host.
     */
    final boolean hasPath = slashPos >= 0;
    String host = hasPath ? entry.substring(0, slashPos) : entry;
    try {
        host = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
    } catch (final PunycodeException ignored) {
        /*
         * Punycode encoding error will be handled in
         * BlacklistHelper.addBlacklistEntry()
         */
    }
    if (!hasPath) {
        return host;
    }
    final String path = MultiProtocolURL.escapePathPattern(entry.substring(slashPos + 1));
    return host + "/" + path;
}

}
@@ -81,8 +81,22 @@
private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?");
//private static final Pattern patternSpace = Pattern.compile("%20");

private final static BitSet UNRESERVED_RFC1738 = new BitSet(128); // register unreserved chars (never escaped in url)
private final static BitSet UNRESERVED_PATH = new BitSet(128); // register unreserved chars for path part (not escaped in path)
/** Register unreserved chars (never escaped in url) */
private final static BitSet UNRESERVED_RFC1738 = new BitSet(128);

/** Register unreserved chars for path part (not escaped in path) */
private final static BitSet UNRESERVED_PATH = new BitSet(128);

/**
* Register regular expressions metacharacters used by the {@link Pattern}
* class.
*
* @see <a href=
* "https://docs.oracle.com/javase/tutorial/essential/regex/literals.html">Regular
* expressions string literals documentation</a>
*/
private static final BitSet PATTERN_METACHARACTERS = new BitSet(128);

static {
// unreserved characters (chars not to escape in url)
for (int i = 'A'; i <= 'Z'; i++) { // hialpha RFC1738 Section 5
@@ -119,6 +133,27 @@
UNRESERVED_PATH.set('@');
UNRESERVED_PATH.set('&');
UNRESERVED_PATH.set('=');

/* Pattern metacharacters : <([{\^-=$!|]})?*+.> (registered one by one, in that order) */
for (final char metaCharacter : "<([{\\^-=$!|]})?*+.>".toCharArray()) {
    PATTERN_METACHARACTERS.set(metaCharacter);
}
}

// session id handling
@@ -552,45 +587,135 @@ private static final String resolveBackpath(final String path) {
* </ul>
*/
private void escape() {
// The path is only escaped when it holds no '%' character at all.
// NOTE(review): the two following statements perform the same check on the path; they look like
// the removed and the added side of a diff shown together - confirm that only one should remain.
if (this.path != null && this.path.indexOf('%') == -1) escapePath();
if (this.path != null && this.path.indexOf('%') == -1) {
this.path = escapePath(this.path);
}
// Same rule for the search part : escaped only when it holds no '%' character.
if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
// The anchor is escaped whenever it is set, without any check on '%'.
if (this.anchor != null) this.anchor = escape(this.anchor).toString();
}

/**
* Url encode/escape the path part according to the allowed characters
* (RFC1738 & RFC2396)
* uses UTF-8 character codes for non-ASCII
*/
private void escapePath() {
final StringBuilder ptmp = new StringBuilder(this.path.length() + 10);
boolean modified = false;
final int len = this.path.length();
for (int i = 0; i < len; i++) {
int ch = this.path.charAt(i);
if (ch <= 0x7F) {
if (UNRESERVED_PATH.get(ch)) {
ptmp.append((char) ch);
} else {
ptmp.append(hex[ch]);
modified = true;
}
} else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
ptmp.append(hex[0xc0 | (ch >> 6)]);
ptmp.append(hex[0x80 | (ch & 0x3F)]);
modified = true;
} else { // 0x7FF < ch <= 0xFFFF
ptmp.append(hex[0xe0 | (ch >> 12)]);
ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
ptmp.append(hex[0x80 | (ch & 0x3F)]);
modified = true;
}
}
if (modified) {
this.path = ptmp.toString();
}
/**
 * <p>Percent-encode/escape an URL path part according to the allowed characters
 * (see RFC3986, and formerly RFC1738 & RFC2396). Uses UTF-8 character codes for
 * non-ASCII.</p>
 * <p>Important : already percent-encoded characters are not re-encoded</p>
 * <p>The path is processed as a plain path, not as a regular expression.</p>
 *
 * @param pathToEscape the path part to escape.
 * @return an escaped path with only ASCII characters, or null when pathToEscape
 *         is null.
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 *      percent-encoding section</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 *      definition</a>
 */
public static String escapePath(final String pathToEscape) {
    final boolean processAsPattern = false;
    return escapePath(pathToEscape, processAsPattern);
}

/**
 * <p>Percent-encode/escape an URL path regular expression according to the allowed
 * characters in an URL path (see RFC3986) and in the {@link Pattern} regular
 * expressions. Uses UTF-8 character codes for non-ASCII.</p>
 * <p>Important : already percent-encoded characters are not re-encoded</p>
 *
 * @param pathPattern the URL path regular expression to escape.
 * @return an escaped path regular expression with only allowed ASCII
 *         characters, or null when pathPattern is null.
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 *      percent-encoding section</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 *      definition</a>
 */
public static String escapePathPattern(final String pathPattern) {
    final boolean processAsPattern = true;
    return escapePath(pathPattern, processAsPattern);
}

/**
 * <p>
 * Percent-encode/escape an URL path part according to the allowed characters
 * specified in RFC3986 (formerly RFC1738 and RFC2396). Uses UTF-8 character
 * codes for non-ASCII, including characters outside the Basic Multilingual
 * Plane, which are encoded as a single four bytes sequence.
 * </p>
 * <p>
 * When isPattern is true, the string is processed as a regular expression, and
 * therefore meta-characters used by the {@link Pattern} class are not
 * percent-encoded.
 * </p>
 * <p>
 * Already percent-encoded characters ('%' followed by two hexadecimal digits)
 * are not re-encoded : only their hexadecimal digits are normalized to upper
 * case.
 * </p>
 *
 * @param pathToEscape the path part to escape.
 * @param isPattern    when true, regular expression meta-characters are not
 *                     escaped
 * @return the escaped path, or the pathToEscape instance itself when nothing
 *         had to be modified, or null when pathToEscape is null.
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 *      percent-encoding section</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 *      definition</a>
 */
private static String escapePath(final String pathToEscape, final boolean isPattern) {
    if (pathToEscape == null) {
        return pathToEscape;
    }
    final StringBuilder ptmp = new StringBuilder(pathToEscape.length() + 10);
    boolean modified = false;
    final int len = pathToEscape.length();
    int i = 0;
    while (i < len) {
        /*
         * Read a whole code point : a surrogate pair must be encoded as one four
         * bytes UTF-8 sequence, and not as two three bytes sequences.
         */
        final int ch = pathToEscape.codePointAt(i);
        if (ch == '%' && (i + 2) < len) {
            final char digit1 = pathToEscape.charAt(i + 1);
            final char digit2 = pathToEscape.charAt(i + 2);
            if (isHexDigit(digit1) && isHexDigit(digit2)) {
                /* Already percent-encoded character */
                ptmp.append((char) ch);
                /* Normalize hexadecimal digits to upper case */
                if (Character.isLowerCase(digit1) || Character.isLowerCase(digit2)) {
                    modified = true;
                }
                ptmp.append(Character.toUpperCase(digit1));
                ptmp.append(Character.toUpperCase(digit2));
                /* Skip the two digits, the '%' itself is skipped at the end of the loop */
                i += 2;
            } else {
                /* Not a valid percent-encoded character : we encode it now */
                ptmp.append(hex[ch]);
                modified = true;
            }
        } else if (isPattern && PATTERN_METACHARACTERS.get(ch)) {
            ptmp.append((char) ch);
        } else if (ch <= 0x7F) {
            if (UNRESERVED_PATH.get(ch)) {
                ptmp.append((char) ch);
            } else {
                ptmp.append(hex[ch]);
                modified = true;
            }
        } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
            ptmp.append(hex[0xc0 | (ch >> 6)]);
            ptmp.append(hex[0x80 | (ch & 0x3F)]);
            modified = true;
        } else if (ch <= 0xFFFF) { // 0x7FF < ch <= 0xFFFF (includes any unpaired surrogate)
            ptmp.append(hex[0xe0 | (ch >> 12)]);
            ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
            ptmp.append(hex[0x80 | (ch & 0x3F)]);
            modified = true;
        } else { // supplementary character : 0xFFFF < ch <= 0x10FFFF
            /* NOTE(review): assumes the hex table covers every byte value up to 0xF4 - confirm */
            ptmp.append(hex[0xf0 | (ch >> 18)]);
            ptmp.append(hex[0x80 | ((ch >> 12) & 0x3F)]);
            ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
            ptmp.append(hex[0x80 | (ch & 0x3F)]);
            modified = true;
        }
        /* One char for a BMP character, two chars for a surrogate pair */
        i += Character.charCount(ch);
    }

    if (modified) {
        return ptmp.toString();
    }
    return pathToEscape;
}

/**
 * Tells whether a character is one of the ASCII hexadecimal digits : '0' to
 * '9', 'a' to 'f' or 'A' to 'F'.
 *
 * @param character a character to test
 * @return true when the character is a valid hexadecimal digit
 */
private static boolean isHexDigit(final int character) {
    if (character >= '0' && character <= '9') {
        return true;
    }
    if (character >= 'a' && character <= 'f') {
        return true;
    }
    return character >= 'A' && character <= 'F';
}

private void escapeSearchpart() {
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
for (final Map.Entry<String, String> element: getAttributes().entrySet()) {
@@ -236,7 +236,10 @@ private void loadList(final BlacklistFile blFile, final String sep) {
log.warn("ignored blacklist path to prevent 'Dangling meta character' exception: " + a);
continue;
}
loadedPathsPattern.add(Pattern.compile(a, Pattern.CASE_INSENSITIVE)); // add case insesitive regex
/* We ensure now that any necessary percent-encoding is applied, as the blacklist file may have been manually edited.
* (when using the web interface, encoding should already have been applied in the add() function) */
final String normalizedPattern = MultiProtocolURL.escapePathPattern(a);
loadedPathsPattern.add(Pattern.compile(normalizedPattern, Pattern.CASE_INSENSITIVE)); // add case insesitive regex
}

// create new entry if host mask unknown, otherwise merge
@@ -348,8 +351,9 @@ public final void add(final BlacklistType blacklistType, final String blacklistT
final String host = itemToAdd.getHost();
final String path = itemToAdd.getPath();
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
final String safePath = MultiProtocolURL.escapePathPattern(path);

if (contains(blacklistType, safeHost, path)) {
if (contains(blacklistType, safeHost, safePath)) {
/* Continue to the next item */
continue;
}
@@ -364,7 +368,7 @@ public final void add(final BlacklistType blacklistType, final String blacklistT
continue;
}

String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
String p = (!safePath.isEmpty() && safePath.charAt(0) == '/') ? safePath.substring(1) : safePath;
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));

// avoid PatternSyntaxException e
@@ -376,7 +380,7 @@ public final void add(final BlacklistType blacklistType, final String blacklistT

Set<Pattern> hostList;
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
blacklistMap.put(h, (hostList = new HashSet<Pattern>()));
blacklistMap.put(h, (hostList = new HashSet<>()));
}

Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
@@ -438,6 +442,7 @@ public final void add (final String blacklistSourcefile, final String host, fina
}

String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
p = MultiProtocolURL.escapePathPattern(p);

// avoid PatternSyntaxException e
String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
@@ -29,7 +29,7 @@ private BlacklistHelper() {
* @param entry a blacklist entry. Must not be null.
* @return the entry eventually modified to be ready to use by the Blacklist engine
*/
protected static String prepareEntry(final String entry) {
public static String prepareEntry(final String entry) {
String newEntry = entry;
/* Remove the eventual unnecessary Regex line beginning char '^' and URL scheme (protocol) part */
Matcher schemeMatcher = URL_SCHEME_PATTERN.matcher(newEntry);

0 comments on commit ed93221

Please sign in to comment.
You can’t perform that action at this time.