Permalink
Browse files

Made mime type and extension normalization locale independent.

Previously, an upper-cased mime type was incorrectly normalized when the
default locale was Turkish.
  • Loading branch information...
luccioman committed Jun 26, 2017
1 parent 319231a commit 286f3018bd1339d7eda3cf2ffcd663dd2dc562f7
Showing with 69 additions and 4 deletions.
  1. +14 −4 source/net/yacy/document/TextParser.java
  2. +55 −0 test/java/net/yacy/document/TextParserTest.java
@@ -29,6 +29,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -128,6 +129,15 @@
for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
return c;
}
/**
 * Lists every mime type that currently has an entry in the mime2parser map.
 * The result is a freshly allocated copy of the key set, so callers may
 * modify it without affecting the parser registry.
 *
 * @return a new set holding all supported mime types
 */
public static Set<String> supportedMimeTypes() {
    // Copy constructor: snapshot of the registered keys.
    return new HashSet<>(mime2parser.keySet());
}
private static void initParser(final Parser parser) {
String prototypeMime = null;
@@ -145,15 +155,15 @@ private static void initParser(final Parser parser) {
}
if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
ext = ext.toLowerCase();
ext = ext.toLowerCase(Locale.ROOT);
final String s = ext2mime.get(ext);
if (s != null && !s.equals(prototypeMime)) AbstractParser.log.info("Parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
ext2mime.put(ext, prototypeMime);
}
for (String ext: parser.supportedExtensions()) {
// process the extensions
ext = ext.toLowerCase();
ext = ext.toLowerCase(Locale.ROOT);
LinkedHashSet<Parser> p0 = ext2parser.get(ext);
if (p0 == null) {
p0 = new LinkedHashSet<Parser>();
@@ -518,12 +528,12 @@ public static String mimeOf(final MultiProtocolURL url) {
}
/**
 * Looks up the mime type registered for a file extension.
 * The extension is lower cased with {@link Locale#ROOT} before the lookup so
 * that the result does not depend on the JVM default locale (with a Turkish
 * default locale, 'I' would otherwise be lower cased to the dotless 'ı').
 *
 * @param ext a file extension, in any letter case; must not be null
 * @return the value stored in ext2mime for the lower cased extension
 *         (presumably null when the extension is unknown)
 */
public static String mimeOf(final String ext) {
    return ext2mime.get(ext.toLowerCase(Locale.ROOT));
}
/**
 * Normalizes a raw mime type string: lower cases it independently of the
 * default locale, drops any parameter part starting at the first ';'
 * (for example "; charset=UTF-8") and trims surrounding whitespace.
 *
 * @param mimeType the raw mime type, possibly null
 * @return the normalized mime type, or "application/octet-stream" when
 *         mimeType is null
 */
private static String normalizeMimeType(String mimeType) {
    if (mimeType == null) return "application/octet-stream";
    // Locale.ROOT only: lower casing with the default locale first would turn
    // 'I' into the dotless 'ı' under a Turkish locale, and the root-locale
    // call could not undo that afterwards.
    mimeType = mimeType.toLowerCase(Locale.ROOT);
    final int pos = mimeType.indexOf(';');
    return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
}
@@ -0,0 +1,55 @@
// TextParserTest.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document;
import static org.junit.Assert.*;
import java.util.Locale;
import org.junit.Test;
/**
 * Unit tests for the {@link TextParser} class.
 *
 * @author luccioman
 *
 */
public class TextParserTest {

    /**
     * Test the TextParser.supportsMime() consistency with available locales.
     * Possible failure case : with the Turkish ("tr") language, 'I' lower cased
     * does not become 'i' but '\u005Cu0131' (the latin small letter 'ı'
     * character).
     * For each available locale set as the default, every supported mime
     * type, once upper cased, is expected to make supportsMime() return null.
     */
    @Test
    public void testSupportsMimeLocaleConsistency() {
        // The default locale is JVM-wide state: remember it so that this test
        // does not leak a changed default into tests running afterwards.
        final Locale initialDefault = Locale.getDefault();
        try {
            for (Locale locale : Locale.getAvailableLocales()) {
                Locale.setDefault(locale);
                for (String mimeType : TextParser.supportedMimeTypes()) {
                    assertNull(locale + " " + mimeType, TextParser.supportsMime(mimeType.toUpperCase(Locale.ROOT)));
                }
            }
        } finally {
            Locale.setDefault(initialDefault);
        }
    }
}

0 comments on commit 286f301

Please sign in to comment.