Permalink
Browse files

Made "tld:" modifier case insensitive and IDN compliant.

This allows typing internationalized top-level domains containing non-ASCII
characters as the value of the tld: modifier.
  • Loading branch information...
luccioman committed Dec 4, 2017
1 parent a4494d6 commit f9cba827c03caf9a2e2c6a5a286bb7294f70b5df
Showing with 24 additions and 6 deletions.
  1. +24 −6 htroot/yacysearch.java
@@ -29,13 +29,15 @@
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
@@ -497,18 +499,34 @@ public static serverObjects respond(
modifier.add("/heuristic");
}
final int tldp = querystring.indexOf("tld:", 0);
final String tldModifierPrefix = "tld:";
final int tldp = querystring.indexOf(tldModifierPrefix, 0);
if (tldp >= 0) {
int ftb = querystring.indexOf(' ', tldp);
if (ftb == -1) ftb = querystring.length();
tld = querystring.substring(tldp + 4, ftb);
querystring = querystring.replace("tld:" + tld, "");
modifier.add("tld:" + tld);
if (ftb == -1) {
ftb = querystring.length();
}
tld = querystring.substring(tldp + tldModifierPrefix.length(), ftb);
querystring = querystring.replace(tldModifierPrefix + tld, "");
modifier.add(tldModifierPrefix + tld);
while ( tld.length() > 0 && tld.charAt(0) == '.' ) {
tld = tld.substring(1);
}
if (tld.length() == 0) tld = null;
if (tld.length() == 0) {
tld = null;
} else {
try {
/* Convert to the same lower case ASCII Compatible Encoding that is used in normalized URLs */
tld = IDN.toASCII(tld, 0);
} catch(final IllegalArgumentException e){
ConcurrentLog.warn("LOCAL_SEARCH", "Failed to convert tld modifier value " + tld + "to ASCII Compatible Encoding (ACE)", e);
}
/* Domain name in an URL is case insensitive : convert now modifier to lower case for further processing over normalized URLs */
tld = tld.toLowerCase(Locale.ROOT);
}
}
if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given
// read the language from the language-restrict option 'lr'

0 comments on commit f9cba82

Please sign in to comment.