*) some TODO markers for UTF-8 problem

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2578 6c8d7289-2bf4-0310-a012-ef5d649a1542
yacy · Sep 14, 2006 · 9ecf7f0 · 9ecf7f0
1 parent e2f8339
commit 9ecf7f0
Showing 1 changed file with 9 additions and 5 deletions.
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
@@ -190,7 +190,7 @@ private void createCondensement(InputStream is) {
         // read source
         sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
         while (wordenum.hasMoreElements()) {
-            word = ((String) wordenum.nextElement()).toLowerCase();
+            word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
             // System.out.println("PARSED-WORD " + word);
             wordlen = word.length();
             if ((wordlen == 1) && (punctuation(word.charAt(0)))) {
@@ -389,6 +389,7 @@ public String[] sentences() {
         String s;
         for (int i = 0; i < orderedSentences.length; i++) {
             if (orderedSentences[i] != null) {
+                // TODO: bugfix for UTF-8: avoid this form of string concatenation
                 s = "";
                 for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) {
                     s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])];
@@ -455,7 +456,7 @@ public void writeMapToFile(File out) throws IOException {
         it = sortedWords.entrySet().iterator(); // enumerates the keys in descending order
         while (it.hasNext()) {
             entry = (Map.Entry) it.next();
-            k = (String) entry.getKey();
+            k = (String) entry.getKey();            
             writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n");
         }
         for (int i = 0; i < orderedSentences.length; i++) {
@@ -475,12 +476,14 @@ protected final static boolean punctuation(char c) {
     }
 
     public final static boolean invisible(char c) {
+        // TODO: Bugfix for UTF-8: every char above 'z' (including all non-ASCII chars) is treated as invisible here
         if ((c < ' ') || (c > 'z')) return true;
         return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
     }
 
     public static Enumeration wordTokenizer(String s, int minLength) {
         try {
+            // TODO: Bugfix for UTF-8 needed: s.getBytes() encodes with the platform default charset
             return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength);
         } catch (Exception e) {
             return null;
@@ -509,6 +512,7 @@ private Object nextElement0() {
                 if (s.length() < ml) continue loop;
                 for (int i = 0; i < s.length(); i++) {
                     c = s.charAt(i);
+                    // TODO: Bugfix needed for UTF-8
                     if (((c < 'a') || (c > 'z')) &&
                         ((c < 'A') || (c > 'Z')) &&
                         ((c < '0') || (c > '9')))
@@ -558,11 +562,11 @@ private Object nextElement0() {
                     sb = new StringBuffer(r.length() * 2);
                     for (int i = 0; i < r.length(); i++) {
                         c = r.charAt(i);
-                        if (invisible(c)) sb = sb.append(' ');
+                        if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
                         else if (punctuation(c)) sb = sb.append(' ').append(c).append(' ');
                         else sb = sb.append(c);
                     }
-                    s = sb.toString().trim();
+                    s = sb.toString().trim(); 
                     //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
                 } else {
                     return null;
@@ -603,7 +607,7 @@ private static class linesFromFileEnum implements Enumeration {
         int counter = 0;
 
         public linesFromFileEnum(InputStream is) {
-            raf = new BufferedReader(new InputStreamReader(is));
+            raf = new BufferedReader(new InputStreamReader(is)); // TODO: bugfix needed for UTF-8, use charset for reader
             buffer = nextElement0();
             counter = 0;
         }