Skip to content

Commit

Permalink
*) some bugfixes for UTF-8 related problems
Browse files: browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2577 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Sep 14, 2006
1 parent f4af607 commit e2f8339
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 7 deletions.
10 changes: 5 additions & 5 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -214,22 +214,22 @@ public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
}
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[0].add(h);
}
if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[1].add(h);
}
if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[2].add(h);
}
if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[3].add(h);
}
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
}

private static String cleanLine(String s) {
Expand Down
9 changes: 8 additions & 1 deletion source/de/anomic/htmlFilter/htmlFilterOutputStream.java
Expand Up @@ -58,6 +58,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Enumeration;
import java.util.Properties;
Expand Down Expand Up @@ -157,7 +158,13 @@ public static byte[] genOpts(Properties prop, byte quotechar) {
while (e.hasMoreElements()) {
key = (String) e.nextElement();
bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '=');
bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar);
bb = bb.append(quotechar);
try {
bb.append(prop.getProperty(key).getBytes("UTF-8"));
} catch (UnsupportedEncodingException e1) {
bb.append(prop.getProperty(key).getBytes());
}
bb.append(quotechar);
}
if (bb.length() > 0) return bb.getBytes(1);
return bb.getBytes();
Expand Down
7 changes: 6 additions & 1 deletion source/de/anomic/server/serverByteBuffer.java
Expand Up @@ -45,6 +45,7 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Properties;

public final class serverByteBuffer extends OutputStream {
Expand Down Expand Up @@ -379,7 +380,11 @@ public Properties propParser() {
start = pos;
while ((pos < length) && (buffer[pos] != doublequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(buffer, start, pos - start).trim());
try {
p.setProperty(key, new String(buffer, start, pos - start,"UTF-8").trim());
} catch (UnsupportedEncodingException e) {
p.setProperty(key, new String(buffer, start, pos - start).trim());
}
pos++;
} else if (buffer[pos] == singlequote) {
// search next singlequote
Expand Down

0 comments on commit e2f8339

Please sign in to comment.