
Do locale-independent case conversion on hosts, schemes, and file extensions.

Required for proper operation when the default system locale is Turkish,
as the dotless and dotted i characters have specific case conversion rules
in this language.
luccioman committed Dec 19, 2017
1 parent 1c4803e commit 5db1c9155a296719fcd79caef4cec850f8e26709
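
As background rather than part of the commit, here is a minimal Java sketch of the pitfall the change guards against: under a Turkish default locale, String.toLowerCase() maps the uppercase ASCII 'I' (U+0049) to the dotless 'ı' (U+0131), so case-insensitive checks on schemes, host names and file extensions silently stop matching. Passing Locale.ROOT forces the locale-neutral mapping. The class name TurkishLocaleDemo is illustrative only and does not exist in YaCy.

import java.util.Locale;

// Illustrative sketch (not YaCy code): why toLowerCase() needs an explicit Locale.ROOT.
public class TurkishLocaleDemo {
    public static void main(String[] args) {
        final Locale turkish = new Locale("tr", "TR");

        // With a Turkish locale, "FILE" lowercases to "fıle" (dotless i, U+0131) ...
        System.out.println("FILE".toLowerCase(turkish).equals("file"));     // false

        // ... while Locale.ROOT applies the locale-independent mapping, giving "file"
        // regardless of the default system locale.
        System.out.println("FILE".toLowerCase(Locale.ROOT).equals("file")); // true
    }
}
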
@@ -2,6 +2,7 @@
import java.net.MalformedURLException;
import java.util.EnumMap;
import java.util.Iterator;
+import java.util.Locale;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
@@ -38,7 +39,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
boolean hasProtocol = false;
for (final YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {
-if(url.toLowerCase().startsWith(p.protocol())) {
+if(url.toLowerCase(Locale.ROOT).startsWith(p.protocol())) {
hasProtocol = true;
break;
}
@@ -30,6 +30,7 @@
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@@ -253,7 +254,7 @@ public void write(final Writer writer, final SolrQueryRequest request, final Sol
writer.write("<media:content medium=\"image\" url=\"");
XML.escapeCharData(imageurl, writer); writer.write("\"/>\n");
} else {
-if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase()) == Response.DT_IMAGE) {
+if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(Locale.ROOT)) == Response.DT_IMAGE) {
writer.write("<media:content medium=\"image\" url=\"");
XML.escapeCharData(url.toNormalform(true), writer); writer.write("\"/>\n");
}
@@ -28,6 +28,7 @@
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.id.MultiProtocolURL;
@@ -220,7 +221,7 @@ public void write(final Writer writer, final SolrQueryRequest request, final Sol
String imageurl = images_protocol.get(0) + "://" + images_stub.get(0);
solitaireTag(writer, "image", imageurl);
} else {
-if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase()) == Response.DT_IMAGE) {
+if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(Locale.ROOT)) == Response.DT_IMAGE) {
solitaireTag(writer, "image", url.toNormalform(true));
}
}
@@ -32,6 +32,7 @@
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -717,7 +718,7 @@ public static String siteFilter(final Collection<? extends MultiProtocolURL> url
String host = url.getHost();
if (host == null) continue;
if (host.startsWith("www.")) host = host.substring(4);
-filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
+filter.append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(".*|");
}
filter.setCharAt(filter.length() - 1, ')');
return filter.toString();
@@ -746,7 +747,7 @@ public static String mustMatchSubpath(final MultiProtocolURL url) {
if (host.startsWith("www.")) host = host.substring(4);
String protocol = url.getProtocol();
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
-return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase())).append(url.getPath()).append(".*").toString();
+return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(url.getPath()).append(".*").toString();
}
public boolean isPushCrawlProfile() {
@@ -27,6 +27,7 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Locale;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
@@ -119,7 +120,7 @@ public StreamResponse openInputStream(final Request request, CrawlProfile profil
port = (ssl) ? 443 : 80;
// check if url is in blacklist
-final String hostlow = host.toLowerCase();
+final String hostlow = host.toLowerCase(Locale.ROOT);
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
"url in blacklist", -1);
@@ -337,7 +338,7 @@ private Response load(final Request request, CrawlProfile profile, final int ret
if (port < 0) port = (ssl) ? 443 : 80;
// check if url is in blacklist
-final String hostlow = host.toLowerCase();
+final String hostlow = host.toLowerCase(Locale.ROOT);
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
@@ -466,7 +467,7 @@ private static Response load(final Request request, ClientIdentification.Agent a
if (port < 0) port = (ssl) ? 443 : 80;
// check if url is in blacklist
-final String hostlow = host.toLowerCase();
+final String hostlow = host.toLowerCase(Locale.ROOT);
if (Switchboard.urlBlacklist != null && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) {
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@@ -33,6 +33,7 @@
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.encoding.ASCII;
@@ -62,7 +63,7 @@
private String info; // this is filled if robots disallowed access; then the reason is noted there;
protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
-this.hostName = hostName.toLowerCase();
+this.hostName = hostName.toLowerCase(Locale.ROOT);
this.mem = mem;
this.info = "";
@@ -100,7 +101,7 @@ protected RobotsTxtEntry(
) {
if (theURL == null) throw new IllegalArgumentException("The url is missing");
-this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
+this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase(Locale.ROOT);
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.sitemapList = new LinkedList<String>();
@@ -34,6 +34,7 @@
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
@@ -982,7 +983,7 @@ public String resolve(String host) {
host = host.substring(p + 1); // if ever, the double-dots are here but do not harm
}
// identify domain
-final String domain = host.substring(0, host.length() - 5).toLowerCase();
+final String domain = host.substring(0, host.length() - 5).toLowerCase(Locale.ROOT);
seed = lookupByName(domain);
if (seed == null) return null;
if (this.mySeed == null) initMySeed();
@@ -38,6 +38,7 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@@ -70,7 +71,7 @@
@Override
public final String toString () {
-return super.toString().toLowerCase();
+return super.toString().toLowerCase(Locale.ROOT);
}
}
@@ -367,7 +368,7 @@ public final void add(final BlacklistType blacklistType, final String blacklistT
// avoid PatternSyntaxException e
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
? "." + safeHost : safeHost).toLowerCase();
? "." + safeHost : safeHost).toLowerCase(Locale.ROOT);
if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p;
}
@@ -436,7 +437,7 @@ public final void add (final String blacklistSourcefile, final String host, fina
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
// avoid PatternSyntaxException e
-String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
+String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
h = Punycode.isBasic(h) ? h : MultiProtocolURL.toPunycode(h);
@@ -516,7 +517,7 @@ public final boolean contains(final BlacklistType blacklistType, final String ho
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e
-final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
+final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
final Set<Pattern> hostList = blacklistMap.get(h);
if (hostList != null) {
@@ -549,7 +550,7 @@ public final boolean isListed(final BlacklistType blacklistType, final DigestURL
HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
if (urlHashCache == null) {
urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
-if (isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
+if (isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile())) {
try {
urlHashCache.put(url.hash());
} catch (final SpaceExceededException e) {
@@ -559,7 +560,7 @@ public final boolean isListed(final BlacklistType blacklistType, final DigestURL
}
}
if (!urlHashCache.has(url.hash())) {
-final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
+final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile());
if (temp) {
try {
urlHashCache.put(url.hash());
@@ -4,6 +4,7 @@
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashMap;
+import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@@ -77,7 +78,7 @@ public void add(final String entry, final EnumSet<listTypes> types) {
assert entry != null;
int pos; // position between domain and path
if((pos = entry.indexOf('/')) > 0) {
-String host = entry.substring(0, pos).trim().toLowerCase();
+String host = entry.substring(0, pos).trim().toLowerCase(Locale.ROOT);
final String path = entry.substring(pos + 1).trim();
// avoid PatternSyntaxException e
@@ -123,7 +124,7 @@ public boolean isListed(final DigestURL url, final EnumSet<listTypes> type) {
return e.containsAll(type);
}
// Cache Miss
-return isListed(url.getHost().toLowerCase(), url.getFile());
+return isListed(url.getHost().toLowerCase(Locale.ROOT), url.getFile());
}
public static boolean isMatchable (final String host) {
@@ -34,6 +34,7 @@
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
@@ -94,7 +95,7 @@ public LoaderDispatcher(final Switchboard sb) {
public boolean isSupportedProtocol(final String protocol) {
if ((protocol == null) || (protocol.isEmpty())) return false;
-return this.supportedProtocols.contains(protocol.trim().toLowerCase());
+return this.supportedProtocols.contains(protocol.trim().toLowerCase(Locale.ROOT));
}
@SuppressWarnings("unchecked")
@@ -208,7 +209,7 @@ private Response loadInternal(final Request request, CacheStrategy cacheStrategy
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
-if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
+if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(Locale.ROOT), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
@@ -362,7 +363,7 @@ private StreamResponse openInputStreamInternal(final Request request, CacheStrat
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
-if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
+if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(Locale.ROOT), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
@@ -29,6 +29,7 @@
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
@@ -257,7 +258,7 @@ private static HandleSet removeAppearanceHashes(final String sentence, final Han
*/
private static boolean isUrlBlacklisted (final BlacklistType blacklistType, final DigestURL url) {
-final boolean isBlacklisted = Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
+final boolean isBlacklisted = Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile());
if (isBlacklisted) {
@@ -62,6 +62,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Locale;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.LogManager;
@@ -314,7 +315,7 @@ public static void doGet(final HashMap<String, Object> conProp, final RequestHea
// handle outgoing cookies
handleOutgoingCookies(requestHeader, url.getHost(), ip);
-prepareRequestHeader(conProp, requestHeader, url.getHost().toLowerCase());
+prepareRequestHeader(conProp, requestHeader, url.getHost().toLowerCase(Locale.ROOT));
final ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
// why are files unzipped upon arrival? why not zip all files in cache?
@@ -1042,7 +1043,7 @@ private static serverObjects unknownHostHandling(final HashMap<String, Object> c
int orgHostPort = orgurl.getPort();
String orgHostName = orgurl.getHost();
if (orgHostName == null) orgHostName = "unknown";
-orgHostName = orgHostName.toLowerCase();
+orgHostName = orgHostName.toLowerCase(Locale.ROOT);
String orgHostPath = orgurl.getPath(); if (orgHostPath == null) orgHostPath = "";
String orgHostArgs = orgurl.getSearchpart();; if (orgHostArgs == null) orgHostArgs = "";
if (orgHostArgs.length() > 0) orgHostArgs = "?" + orgHostArgs;
