Skip to content

Commit

Permalink
added some UTF-8 handling.
Browse files Browse the repository at this point in the history
Hope this will help somehow; it is for sure not THE solution to our UTF-8 problem.


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1308 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jan 10, 2006
1 parent 7586fcb commit 9544c47
Show file tree
Hide file tree
Showing 34 changed files with 168 additions and 90 deletions.
9 changes: 8 additions & 1 deletion htroot/MessageSend_p.java
Expand Up @@ -44,6 +44,7 @@
// javac -classpath .:../Classes MessageSend_p.java
// if the shell's current path is HTROOT

import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
Expand Down Expand Up @@ -128,7 +129,13 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
if (messagesize < 1000) messagesize = 1000; // debug
if (subject.length() > 100) subject = subject.substring(0, 100);
if (message.length() > messagesize) message = message.substring(0, messagesize);
HashMap result = yacyClient.postMessage(hash, subject, message.getBytes());
byte[] mb;
try {
mb = message.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
mb = message.getBytes();
}
HashMap result = yacyClient.postMessage(hash, subject, mb);
body += "<p>Your message has been sent. The target peer responded:</p>";
body += "<p><i>" + result.get("response") + "</i></p>";
} catch (NumberFormatException e) {
Expand Down
16 changes: 10 additions & 6 deletions htroot/Wiki.java
Expand Up @@ -73,7 +73,7 @@ public static String dateString(Date date) {
}


public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws IOException {
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
if (post == null) {
Expand All @@ -94,11 +94,15 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}
}

if (post.containsKey("submit")) {
// store a new page
switchboard.wikiDB.write(switchboard.wikiDB.newEntry(pagename, author, ip,
post.get("reason", "edit"),
post.get("content", "").getBytes()));
if (post.containsKey("submit")) {
// store a new page
byte[] content;
try {
content = post.get("content", "").getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
content = post.get("content", "").getBytes();
}
switchboard.wikiDB.write(switchboard.wikiDB.newEntry(pagename, author, ip, post.get("reason", "edit"), content));
// create a news message
HashMap map = new HashMap();
map.put("page", pagename);
Expand Down
6 changes: 3 additions & 3 deletions htroot/htdocsdefault/dir.java
Expand Up @@ -173,7 +173,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
try {
serverFileUtils.write(binary, newfile);
String md5s = serverCodings.encodeMD5Hex(newfile);
serverFileUtils.write((md5s + "\n" + description).getBytes(), newfilemd5); // generate md5
serverFileUtils.write((md5s + "\n" + description).getBytes("UTF-8"), newfilemd5); // generate md5

// index file info
if (post.get("indexing", "").equals("on")) {
Expand Down Expand Up @@ -262,7 +262,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
// generate md5 on-the-fly
md5s = serverCodings.encodeMD5Hex(f);
description = "";
serverFileUtils.write((md5s + "\n" + description).getBytes(), fmd5);
serverFileUtils.write((md5s + "\n" + description).getBytes("UTF-8"), fmd5);
}
} catch (IOException e) {
md5s = "";
Expand Down Expand Up @@ -478,7 +478,7 @@ public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring,
public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
try {
final String urlhash = plasmaURL.urlHash(new URL(urlstring));
final Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes());
final Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
switchboard.removeReferences(urlhash, words);
switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) {
Expand Down
9 changes: 8 additions & 1 deletion htroot/yacy/message.java
Expand Up @@ -49,6 +49,7 @@
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
import de.anomic.data.messageBoard;
Expand Down Expand Up @@ -125,11 +126,17 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve

// save message
messageBoard.entry msgEntry = null;
byte[] mb;
try {
mb = message.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
mb = message.getBytes();
}
sb.messageDB.write(msgEntry = sb.messageDB.newEntry(
"remote",
otherSeed.get(yacySeed.NAME, "anonymous"), otherSeed.hash,
yacyCore.seedDB.mySeed.getName(), yacyCore.seedDB.mySeed.hash,
subject, message.getBytes()));
subject, mb));

messageForwardingViaEmail(ss, msgEntry);

Expand Down
1 change: 1 addition & 0 deletions source/dbtest.java
Expand Up @@ -24,6 +24,7 @@ public class dbtest {

public final static int keylength = 12;
public final static int valuelength = 223; // sum of all data length as defined in plasmaURL
//public final static long buffer = 0;
public final static long buffer = 8192 * 1024; // 8 MB buffer
public static byte[] dummyvalue1 = new byte[valuelength];
public static byte[] dummyvalue2 = new byte[valuelength];
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/data/wikiBoard.java
Expand Up @@ -149,7 +149,7 @@ public String guessAuthor(String ip) {
return author;
}

public entry newEntry(String subject, String author, String ip, String reason, byte[] page) {
public entry newEntry(String subject, String author, String ip, String reason, byte[] page) throws IOException {
return new entry(normalize(subject), author, ip, reason, page);
}

Expand All @@ -158,17 +158,17 @@ public class entry {
String key;
Map record;

public entry(String subject, String author, String ip, String reason, byte[] page) {
public entry(String subject, String author, String ip, String reason, byte[] page) throws IOException {
record = new HashMap();
key = subject;
if (key.length() > keyLength) key = key.substring(0, keyLength);
record.put("date", dateString());
if ((author == null) || (author.length() == 0)) author = "anonymous";
record.put("author", kelondroBase64Order.enhancedCoder.encode(author.getBytes()));
record.put("author", kelondroBase64Order.enhancedCoder.encode(author.getBytes("UTF-8")));
if ((ip == null) || (ip.length() == 0)) ip = "";
record.put("ip", ip);
if ((reason == null) || (reason.length() == 0)) reason = "";
record.put("reason", kelondroBase64Order.enhancedCoder.encode(reason.getBytes()));
record.put("reason", kelondroBase64Order.enhancedCoder.encode(reason.getBytes("UTF-8")));
if (page == null)
record.put("page", "");
else
Expand Down
7 changes: 6 additions & 1 deletion source/de/anomic/data/wikiCode.java
Expand Up @@ -50,6 +50,7 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
Expand All @@ -73,7 +74,11 @@ public wikiCode(plasmaSwitchboard switchboard){
}

public String transform(String content){
return transform(content.getBytes(), sb);
try {
return transform(content.getBytes("UTF-8"), sb);
} catch (UnsupportedEncodingException e) {
return transform(content.getBytes(), sb);
}
}
public String transform(byte[] content){
return transform(content, sb);
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -230,7 +230,7 @@ public String getHeadline() {
}

public byte[] getText() {
return content.getBytes();
return content.getBytes();
}

public Map getAnchors() {
Expand Down
Expand Up @@ -46,6 +46,7 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.UnsupportedEncodingException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Locale;
Expand Down Expand Up @@ -117,7 +118,12 @@ private static byte[] genBlueLetters(int length) {

private boolean hit(byte[] text) {
if (text == null || bluelist == null) return false;
String lc = new String(text).toLowerCase();
String lc;
try {
lc = new String(text, "UTF-8").toLowerCase();
} catch (UnsupportedEncodingException e) {
lc = new String(text).toLowerCase();
}
for (int i = 0; i < bluelist.size(); i++) {
if (lc.indexOf((String) bluelist.get(i)) >= 0) return true;
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/http/httpd.java
Expand Up @@ -710,7 +710,7 @@ public static int parseArgs(serverObjects args, InputStream in, int length) thro
bout.close(); bout = null;
}

int argc = parseArgs(args, new String(buffer));
int argc = parseArgs(args, new String(buffer, "UTF-8"));
buffer = null;
return argc;
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/kelondro/kelondroAbstractRA.java
Expand Up @@ -152,7 +152,7 @@ public String readLine() throws IOException {
return new String(bb, 0, bbsize);
}
if (c == cr) continue;
if (c == lf) return new String(bb, 0, bbsize);
if (c == lf) return new String(bb, 0, bbsize, "UTF-8");

// append to bb
if (bbsize == bb.length) {
Expand Down
32 changes: 30 additions & 2 deletions source/de/anomic/kelondro/kelondroArray.java
Expand Up @@ -98,7 +98,6 @@ public synchronized byte[][] get(int index) throws IOException {
return getNode(new Handle(index)).getValues();
}


public synchronized int seti(int index, int value) throws IOException {
int before = getHandle(index).hashCode();
setHandle(index, new Handle(value));
Expand All @@ -109,13 +108,28 @@ public synchronized int geti(int index) {
return getHandle(index).hashCode();
}

public synchronized int add(byte[][] row) throws IOException {
if (row.length != columns())
throw new IllegalArgumentException("add: wrong row length " + row.length + "; must be " + columns());

Node n = newNode();
n.commit(CP_LOW);
int index = n.handle().hashCode();
set(index, row);
return index;
}

public synchronized void remove(int index) throws IOException {
deleteNode(new Handle(index));
}

public void print() throws IOException {
System.out.println("PRINTOUT of table, length=" + size());
byte[][] row;
for (int i = 0; i < size(); i++) {
System.out.print("row " + i + ": ");
row = get(i);
for (int j = 0; j < columns(); j++) System.out.print(((row[j] == null) ? "NULL" : new String(row[j])) + ", ");
for (int j = 0; j < columns(); j++) System.out.print(((row[j] == null) ? "NULL" : new String(row[j], "UTF-8")) + ", ");
System.out.println();
}
System.out.println("EndOfTable");
Expand Down Expand Up @@ -160,6 +174,20 @@ private static void cmd(String[] args) {
fm.set(Integer.parseInt(args[2]), row);
fm.close();
} else
if ((args.length == 3) && (args[0].equals("-a"))) {
// add <filename> <value>
kelondroArray fm = new kelondroArray(new File(args[1]));
byte[][] row = new byte[][] { args[2].getBytes() };
int index = fm.add(row);
System.out.println("Added to row " + index);
fm.close();
} else
if ((args.length == 3) && (args[0].equals("-d"))) {
// delete <filename> <index>
kelondroArray fm = new kelondroArray(new File(args[1]));
fm.remove(Integer.parseInt(args[2]));
fm.close();
} else
if ((args.length == 1) && (args[0].equals("-test"))) {
File testfile = new File("test.array");
if (testfile.exists()) testfile.delete();
Expand Down
21 changes: 17 additions & 4 deletions source/de/anomic/kelondro/kelondroBase64Order.java
Expand Up @@ -52,22 +52,34 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond

private static final char[] alpha_standard = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
private static final char[] alpha_enhanced = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_".toCharArray();
private static final byte[] ahpla_standard = new byte[256];
private static final byte[] ahpla_enhanced = new byte[256];

static {
for (int i = 0; i < 256; i++) {
ahpla_standard[i] = -1;
ahpla_enhanced[i] = -1;
}
for (int i = 0; i < alpha_standard.length; i++) {
ahpla_standard[alpha_standard[i]] = (byte) i;
ahpla_enhanced[alpha_enhanced[i]] = (byte) i;
}
}

public static final kelondroBase64Order standardCoder = new kelondroBase64Order(true);
public static final kelondroBase64Order enhancedCoder = new kelondroBase64Order(false);

final boolean rfc1113compliant;

private final char[] alpha;
private final byte[] ahpla = new byte[256];
private final byte[] ahpla;

public kelondroBase64Order(boolean rfc1113compliant) {
// if we choose not to be rfc1113compliant,
// then we get shorter base64 results which are also filename-compatible
this.rfc1113compliant = rfc1113compliant;
alpha = (rfc1113compliant) ? alpha_standard : alpha_enhanced;
for (int i = 0; i < 256; i++) ahpla[i] = -1;
for (int i = 0; i < alpha.length; i++) ahpla[alpha[i]] = (byte) i;
ahpla = (rfc1113compliant) ? ahpla_standard : ahpla_enhanced;
}

public char encodeByte(byte b) {
Expand Down Expand Up @@ -143,7 +155,8 @@ public String encode(byte[] in) {

public String decodeString(String in) {
try {
return new String(decode(in), "ISO-8859-1");
//return new String(decode(in), "ISO-8859-1");
return new String(decode(in), "UTF-8");
} catch (java.io.UnsupportedEncodingException e) {
System.out.println("internal error in base64: " + e.getMessage());
return null;
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/kelondro/kelondroHashtable.java
Expand Up @@ -230,7 +230,7 @@ private Object[] search(Hash hash) throws IOException {
rowNumber = hash.node();
if (rowNumber >= hashArray.size()) return new Object[]{new Integer(rowNumber), null};
row = hashArray.get(rowNumber);
rowKey = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(row[0]));
rowKey = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(row[0], "UTF-8"));
if (rowKey == 0) return new Object[]{new Integer(rowNumber), null};
hash.rehash();
} while (rowKey != hash.key());
Expand Down
4 changes: 4 additions & 0 deletions source/de/anomic/kelondro/kelondroNaturalOrder.java
Expand Up @@ -68,6 +68,10 @@ public long cardinal(byte[] key) {
// two arrays are also equal if one array is a subset of the other's array
// with filled-up char(0)-values
public int compare(byte[] a, byte[] b) {
return compares(a, b);
}

public static final int compares(byte[] a, byte[] b) {
int i = 0;
final int al = a.length;
final int bl = b.length;
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/kelondro/kelondroRecords.java
Expand Up @@ -790,7 +790,7 @@ public String toString() {
if (h == null) s = s + ":hNULL"; else s = s + ":h" + h.toString();
}
byte[][] content = getValues();
for (int i = 0; i < content.length; i++) s = s + ":" + ((content[i] == null) ? "NULL" : (new String(content[i])).trim());
for (int i = 0; i < content.length; i++) s = s + ":" + ((content[i] == null) ? "NULL" : (new String(content[i], "UTF-8")).trim());
} catch (IOException e) {
s = s + ":***LOAD ERROR***:" + e.getMessage();
}
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/parser/odt/odtParser.java
Expand Up @@ -130,11 +130,11 @@ public plasmaParserDocument parse(URL location, String mimeType, File dest) thro
if (docShortTitle != null) {
docLongTitle = docShortTitle;
} else if (docContent.length <= 80) {
docLongTitle = new String(docContent);
docLongTitle = new String(docContent, "UTF-8");
} else {
byte[] title = new byte[80];
System.arraycopy(docContent, 0, title, 0, 80);
docLongTitle = new String(title);
docLongTitle = new String(title, "UTF-8");
}
docLongTitle.
replaceAll("\r\n"," ").
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/parser/pdf/pdfParser.java
Expand Up @@ -127,7 +127,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
out = null;

if ((docTitle == null) || (docTitle.length() == 0)) {
docTitle = ((contents.length > 80)? new String(contents, 0, 80):new String(contents)).
docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
Expand Down
18 changes: 9 additions & 9 deletions source/de/anomic/plasma/plasmaCrawlEURL.java
Expand Up @@ -171,15 +171,15 @@ public Entry(String hash) throws IOException {
this.hash = hash;
byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
this.referrer = new String(entry[1]);
this.initiator = new String(entry[2]);
this.executor = new String(entry[3]);
this.url = new URL(new String(entry[4]).trim());
this.name = new String(entry[5]).trim();
this.initdate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[6])));
this.trydate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[7])));
this.trycount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8]));
this.failreason = new String(entry[9]);
this.referrer = new String(entry[1], "UTF-8");
this.initiator = new String(entry[2], "UTF-8");
this.executor = new String(entry[3], "UTF-8");
this.url = new URL(new String(entry[4], "UTF-8").trim());
this.name = new String(entry[5], "UTF-8").trim();
this.initdate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[6], "UTF-8")));
this.trydate = new Date(86400000 * kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[7], "UTF-8")));
this.trycount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8"));
this.failreason = new String(entry[9], "UTF-8");
this.flags = new bitfield(entry[10]);
return;
}
Expand Down

0 comments on commit 9544c47

Please sign in to comment.