Permalink
Browse files

Ensure lower case conversion consistency with any default locale.

Especially for Turkish-speaking users using "tr" as their system default
locale: strings for technical stuff (URLs, tag names, constants...)
must not be lower-cased with the default locale, as 'I' doesn't become
'i' like in other locales such as "en", but becomes 'ı'.
  • Loading branch information...
luccioman committed Jun 27, 2017
1 parent 286f301 commit 8da3174867b503aae21643ccb032ff43a6647ec3
Showing with 95 additions and 60 deletions.
  1. +2 −1 htroot/ConfigHeuristics_p.java
  2. +3 −2 htroot/Crawler_p.java
  3. +3 −1 htroot/api/blacklists/add_entry_p.java
  4. +3 −1 htroot/api/blacklists/delete_entry_p.java
  5. +2 −1 htroot/api/getpageinfo_p.java
  6. +2 −1 htroot/api/ymarks/add_ymark.java
  7. +1 −1 source/net/yacy/cora/protocol/RequestHeader.java
  8. +2 −1 source/net/yacy/cora/protocol/http/HTTPClient.java
  9. +2 −1 source/net/yacy/crawler/retrieval/Response.java
  10. +3 −2 source/net/yacy/data/wiki/WikiCode.java
  11. +11 −10 source/net/yacy/document/Document.java
  12. +3 −2 source/net/yacy/document/content/SurrogateReader.java
  13. +3 −2 source/net/yacy/document/parser/html/TransformerWriter.java
  14. +3 −1 source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java
  15. +2 −1 source/net/yacy/gui/framework/Browser.java
  16. +2 −1 source/net/yacy/http/AbstractRemoteHandler.java
  17. +3 −2 source/net/yacy/http/servlets/YaCyDefaultServlet.java
  18. +3 −2 source/net/yacy/kelondro/data/meta/URIMetadataNode.java
  19. +1 −1 source/net/yacy/kelondro/util/Formatter.java
  20. +4 −3 source/net/yacy/kelondro/util/ISO639.java
  21. +2 −1 source/net/yacy/kelondro/util/OS.java
  22. +2 −1 source/net/yacy/peers/Network.java
  23. +2 −1 source/net/yacy/peers/operation/yacyRelease.java
  24. +3 −2 source/net/yacy/search/index/Segment.java
  25. +2 −1 source/net/yacy/search/query/QueryModifier.java
  26. +2 −1 source/net/yacy/search/query/QueryParams.java
  27. +4 −3 source/net/yacy/search/schema/CollectionConfiguration.java
  28. +2 −1 source/net/yacy/search/schema/CollectionSchema.java
  29. +2 −1 source/net/yacy/search/schema/WebgraphSchema.java
  30. +2 −1 source/net/yacy/server/serverObjects.java
  31. +2 −1 source/net/yacy/utils/translation/TranslatorXliff.java
  32. +8 −7 source/net/yacy/yacy.java
  33. +4 −2 test/java/net/yacy/document/parser/htmlParserTest.java
@@ -39,6 +39,7 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Locale;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.FederateSearchManager;
@@ -101,7 +102,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
// add new entry to config file
final String tmpname = post.get("ossys_newtitle");
if (tmpname != null && tmpurl !=null) {
if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) {
if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase(Locale.ROOT).contains("{searchterms}")) {
/* Check eventual robots.txt policy */
RobotsTxtEntry robotsEntry = null;
try {
@@ -29,6 +29,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@@ -683,8 +684,8 @@ public static serverObjects respond(final RequestHeader header, final serverObje
try {
wantedPPM = post.getInt("customPPM", wantedPPM);
} catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 30000;
if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
int wPPM = wantedPPM;
if ( wPPM <= 0 ) {
@@ -1,3 +1,5 @@
import java.util.Locale;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.ListManager;
import net.yacy.data.WorkTables;
@@ -25,7 +27,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
ListManager.switchboard.tables.recordAPICall(
post,
"add_entry_p." + header.fileType().toString().toLowerCase(),
"add_entry_p." + header.fileType().toString().toLowerCase(Locale.ROOT),
WorkTables.TABLE_API_TYPE_CONFIGURATION,
"add to blacklist '" + blacklistToUse + "': " + entry);
@@ -1,3 +1,5 @@
import java.util.Locale;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.ListManager;
import net.yacy.data.WorkTables;
@@ -26,7 +28,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// store this call as api call
ListManager.switchboard.tables.recordAPICall(
post,
"delete_entry_p." + header.fileType().toString().toLowerCase(),
"delete_entry_p." + header.fileType().toString().toLowerCase(Locale.ROOT),
WorkTables.TABLE_API_TYPE_CONFIGURATION,
"delete from blacklist '" + blacklistToUse + "': " + entry);
@@ -30,6 +30,7 @@
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
@@ -116,7 +117,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
if (post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
if (url.toLowerCase().startsWith("ftp://")) {
if (url.toLowerCase(Locale.ROOT).startsWith("ftp://")) {
prop.put("robots-allowed", "1"); // ok to crawl
prop.put("robotsInfo", "ftp does not follow robots.txt");
prop.putXML("title", "FTP: " + url);
@@ -1,4 +1,5 @@
import java.io.IOException;
import java.util.Locale;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
@@ -51,7 +52,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
boolean hasProtocol = false;
for (final YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {
if(url.toLowerCase().startsWith(p.protocol())) {
if(url.toLowerCase(Locale.ROOT).startsWith(p.protocol())) {
hasProtocol = true;
break;
}
@@ -129,7 +129,7 @@ public Date ifModifiedSince() {
public FileType fileType() {
String path = this.getPathInfo();
if (path == null) return FileType.HTML;
path = path.toLowerCase();
path = path.toLowerCase(Locale.ROOT);
if (path.endsWith(".json")) return FileType.JSON;
if (path.endsWith(".xml")) return FileType.XML;
if (path.endsWith(".rdf")) return FileType.XML;
@@ -34,6 +34,7 @@
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@@ -664,7 +665,7 @@ public String getMimeType() {
mimeType = contentType.getValue();
if (mimeType != null) {
mimeType = mimeType.trim().toLowerCase();
mimeType = mimeType.trim().toLowerCase(Locale.ROOT);
final int pos = mimeType.indexOf(';');
if(pos >= 0) {
@@ -28,6 +28,7 @@
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Locale;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
@@ -787,7 +788,7 @@ public String getMimeType() {
String mimeType = this.responseHeader.getContentType();
if (mimeType != null) {
mimeType = mimeType.trim().toLowerCase();
mimeType = mimeType.trim().toLowerCase(Locale.ROOT);
final int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
@@ -30,6 +30,7 @@
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
@@ -1069,7 +1070,7 @@ protected static String processMetadata(final String line) {
}
if(closeIndex > 0) {
final String content = processedLine.substring(openIndex + LEN_WIKI_OPEN_METADATA, closeIndex);
if (content.toLowerCase().startsWith("coordinate")) {
if (content.toLowerCase(Locale.ROOT).startsWith("coordinate")) {
// parse Geographical Coordinates as described in
// http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates
// looks like:
@@ -1087,7 +1088,7 @@ protected static String processMetadata(final String line) {
String name = "";
try {
for (final String c : b) {
if (c.toLowerCase().startsWith("name=")) {
if (c.toLowerCase(Locale.ROOT).startsWith("name=")) {
name = c.substring(5);
}
if (c.toUpperCase().startsWith("NS=")) {
@@ -44,6 +44,7 @@
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
@@ -565,8 +566,8 @@ private void resortLinks() {
continue;
}
final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0;
final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0;
final boolean noindex = url.getRelProperty().toLowerCase(Locale.ROOT).indexOf("noindex",0) >= 0;
final boolean nofollow = url.getRelProperty().toLowerCase(Locale.ROOT).indexOf("nofollow",0) >= 0;
if ((thishost == null && url.getHost() == null) ||
((thishost != null && url.getHost() != null) &&
(url.getHost().endsWith(thishost) ||
@@ -578,9 +579,9 @@ private void resortLinks() {
extpos = u.lastIndexOf('.');
if (extpos > 0) {
if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) {
ext = u.substring(extpos + 1, qpos).toLowerCase();
ext = u.substring(extpos + 1, qpos).toLowerCase(Locale.ROOT);
} else {
ext = u.substring(extpos + 1).toLowerCase();
ext = u.substring(extpos + 1).toLowerCase(Locale.ROOT);
}
if (Classification.isMediaExtension(ext)) {
// this is not a normal anchor, its a media link
@@ -705,10 +706,10 @@ else if (o instanceof IconEntry)
u = url.toNormalform(true);
// find start of a referenced http url
if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) { // 7 = skip the protocol part of the source url
if ((pos = u.toLowerCase(Locale.ROOT).indexOf("http://", 7)) > 0) { // 7 = skip the protocol part of the source url
i.remove();
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
while ((pos = u.toLowerCase(Locale.ROOT).indexOf("http://", 7)) > 0)
u = u.substring(pos);
url = new AnchorURL(u);
if (!(v.containsKey(url)))
@@ -717,21 +718,21 @@ else if (o instanceof IconEntry)
}
// find start of a referenced https url
if ((pos = u.toLowerCase().indexOf("https://", 7)) > 0) { // 7 = skip the protocol part of the source url
if ((pos = u.toLowerCase(Locale.ROOT).indexOf("https://", 7)) > 0) { // 7 = skip the protocol part of the source url
i.remove();
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("https://", 7)) > 0)
while ((pos = u.toLowerCase(Locale.ROOT).indexOf("https://", 7)) > 0)
u = u.substring(pos);
url = new AnchorURL(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
}
if ((pos = u.toLowerCase().indexOf("/www.", 11)) > 0) { // 11 = skip protocol part + www of source url "http://www."
if ((pos = u.toLowerCase(Locale.ROOT).indexOf("/www.", 11)) > 0) { // 11 = skip protocol part + www of source url "http://www."
i.remove();
u = url.getProtocol()+":/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 11)) > 0)
while ((pos = u.toLowerCase(Locale.ROOT).indexOf("/www.", 11)) > 0)
u = url.getProtocol()+":/" + u.substring(pos);
AnchorURL addurl = new AnchorURL(u);
@@ -32,6 +32,7 @@
import java.io.StringReader;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
@@ -220,7 +221,7 @@ private boolean isSolrDump() {
@Override
public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException {
if (tag == null) return;
tag = tag.toLowerCase();
tag = tag.toLowerCase(Locale.ROOT);
if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) {
this.dcEntry = new DCEntry();
} else if ("element".equals(tag) || "str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)) {
@@ -239,7 +240,7 @@ public void startElement(final String uri, final String name, String tag, final
@Override
public void endElement(final String uri, final String name, String tag) {
if (tag == null) return;
tag = tag.toLowerCase();
tag = tag.toLowerCase(Locale.ROOT);
if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) {
try {
// check if url is in accepted domain
@@ -37,6 +37,7 @@
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Locale;
import java.util.Properties;
import java.util.Stack;
import net.yacy.document.parser.html.ContentScraper.TagName;
@@ -194,7 +195,7 @@ public TransformerWriter(
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2).toLowerCase();
tag = new String(in, 2, tagend - 2).toLowerCase(Locale.ROOT);
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, false);
@@ -207,7 +208,7 @@ public TransformerWriter(
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1).toLowerCase();
tag = new String(in, 1, tagend - 1).toLowerCase(Locale.ROOT);
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, true);
@@ -5,6 +5,8 @@
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Locale;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
@@ -38,7 +40,7 @@ public RDFaTripleImpl(Reader in, String base) throws IOException,
BufferedReader bufReader = new BufferedReader(in);
bufReader.mark(2048); // mark position for following reset
String readLine = bufReader.readLine();
if (!readLine.toLowerCase().contains("<!doctype")){
if (!readLine.toLowerCase(Locale.ROOT).contains("<!doctype")){
bufReader.reset();
}
@@ -30,6 +30,7 @@
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Locale;
import java.util.Properties;
import net.yacy.cora.util.ConcurrentLog;
@@ -61,7 +62,7 @@
static {
// check operation system type
final Properties sysprop = System.getProperties();
final String sysname = sysprop.getProperty("os.name","").toLowerCase();
final String sysname = sysprop.getProperty("os.name","").toLowerCase(Locale.ROOT);
if (sysname.startsWith("mac os x")) {
systemOS = systemMacOSX;
} else if (sysname.startsWith("mac os")) {
@@ -27,6 +27,7 @@
import java.io.IOException;
import java.net.InetAddress;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
@@ -143,7 +144,7 @@ public void handle(String target, Request baseRequest, HttpServletRequest reques
}
// check the blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.PROXY, hostOnly.toLowerCase(), request.getPathInfo())) {
if (Switchboard.urlBlacklist.isListed(BlacklistType.PROXY, hostOnly.toLowerCase(Locale.ROOT), request.getPathInfo())) {
response.sendError(HttpServletResponse.SC_FORBIDDEN,
"URL '" + hostOnly + "' blocked by yacy proxy (blacklisted)");
baseRequest.setHandled(true);
@@ -39,6 +39,7 @@
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
@@ -705,15 +706,15 @@ public static String getContext(final RequestHeader header, final Switchboard sb
/* Let's check this header has a valid value */
if("http".equals(protocolHeader) || "https".equals(protocolHeader)) {
protocol = protocolHeader.toLowerCase();
protocol = protocolHeader.toLowerCase(Locale.ROOT);
} else if(protocolHeader != null && !protocolHeader.isEmpty()) {
ConcurrentLog.warn("FILEHANDLER","YaCyDefaultServlet: illegal protocol scheme header value : " + protocolHeader);
}
/* This peer can also be behind a reverse proxy requested using https, even if the request coming to this YaCy peer is http only
* Possible scenario (happens for example when YaCy is deployed on Heroku Platform) : User browser -> https://reverseProxy/yacyURL -> http://yacypeer/yacyURL
* In that case, absolute URLs rendered by this peer (in rss feeds for example) must effectively start with the https scheme */
protocolHeader = header.get(HttpHeaders.X_FORWARDED_PROTO.toString(), "").toLowerCase();
protocolHeader = header.get(HttpHeaders.X_FORWARDED_PROTO.toString(), "").toLowerCase(Locale.ROOT);
/* Here we only allow an upgrade from HTTP to HTTPS, not the reverse (we don't want a forged HTTP header by an eventual attacker to force fallback to HTTP) */
if("https".equals(protocolHeader)) {
@@ -34,6 +34,7 @@
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
@@ -264,7 +265,7 @@ public DigestURL url() {
public boolean matches(Pattern pattern) {
// Returns true only when the pattern matches the entire string produced by
// this.url.toNormalform(true) (Matcher.matches() is a whole-input match, not a search).
return pattern.matcher(this.url.toNormalform(true)).matches();
// Everything below is a disabled, automaton-based alternative kept for reference only;
// it is commented out and never executes.
//CharacterRunAutomaton automaton = new CharacterRunAutomaton(matcher);
//boolean match = automaton.run(this.url.toNormalform(true).toLowerCase());
//boolean match = automaton.run(this.url.toNormalform(true).toLowerCase(Locale.ROOT));
//return match;
}
@@ -930,7 +931,7 @@ public String urlstring() {
if (this.alternative_urlstring != null) return this.alternative_urlstring;
if (!pdfParser.individualPages) return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase())) return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase(Locale.ROOT))) return this.url().toNormalform(true);
// for pdf links we rewrite the url
// this is a special treatment of pdf files which can be splitted into subpages
String pageprop = pdfParser.individualPagePropertyname;
Oops, something went wrong.

0 comments on commit 8da3174

Please sign in to comment.