Skip to content

Commit

Permalink
*) changes needed for multi-language support
Browse files Browse the repository at this point in the history
   - parsers may need to know the charset of the byte stream 

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Sep 15, 2006
1 parent 31d6cde commit d0a5a53
Show file tree
Hide file tree
Showing 23 changed files with 133 additions and 65 deletions.
12 changes: 12 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -47,6 +47,8 @@
import de.anomic.net.URL;

import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
//private String headline;
private List[] headlines;
private serverByteBuffer content;

private URL root;
private String charset = "UTF-8";

public htmlFilterContentScraper(URL root) {
// the root value here will not be used to load the resource.
Expand All @@ -109,6 +113,14 @@ public htmlFilterContentScraper(URL root) {
this.content = new serverByteBuffer(1024);
}

/**
 * Stores the name of the charset on this scraper, after verifying
 * that the running JVM can actually provide a charset of that name.
 *
 * @param charset the name of the charset to remember
 * @throws UnsupportedCharsetException if no charset with the given name is available
 */
public void setCharset(String charset) throws UnsupportedCharsetException {
    // refuse names the JVM cannot resolve; the field stays untouched in that case
    if (!Charset.isSupported(charset)) {
        throw new UnsupportedCharsetException(charset);
    }

    this.charset = charset;
}

public void scrapeText(byte[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
Expand Down
6 changes: 6 additions & 0 deletions source/de/anomic/plasma/cache/IResourceInfo.java
Expand Up @@ -82,6 +82,12 @@ public interface IResourceInfo {
*/
public String getMimeType();

/**
* Returns the charset of the resource
* @return returns the name of the charset or <code>null</code> if unknown
*/
public String getCharSet();

/**
* Returns the modification date of the cached object
* @return the modification date
Expand Down
4 changes: 4 additions & 0 deletions source/de/anomic/plasma/cache/ftp/ResourceInfo.java
Expand Up @@ -161,4 +161,8 @@ public boolean validResponseStatus(String responseStatus) {
return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
}

/**
 * Returns the charset of the resource.
 * This implementation always reports the charset as unknown.
 *
 * @return always <code>null</code>
 */
public String getCharSet() {
return null;
}

}
9 changes: 9 additions & 0 deletions source/de/anomic/plasma/cache/http/ResourceInfo.java
Expand Up @@ -110,6 +110,15 @@ public String getMimeType() {
int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
}

/**
 * Returns the charset named in the <code>charset</code> parameter of the
 * mime string reported by the response header
 * (e.g. <code>text/html; charset=ISO-8859-1</code> yields <code>ISO-8859-1</code>).
 *
 * @return the name of the charset, or <code>null</code> if there is no
 *         response header, no mime string or no <code>charset</code> parameter
 */
public String getCharSet() {
    if (this.responseHeader == null) return null;

    String mimeType = this.responseHeader.mime();
    // NOTE(review): mime() may never return null - guard kept as a cheap safety net
    if (mimeType == null) return null;

    // walk over all ";"-separated parameters that follow the media type
    int pos = mimeType.indexOf(';');
    while (pos >= 0) {
        int end = mimeType.indexOf(';', pos + 1);
        String param = ((end < 0) ? mimeType.substring(pos + 1) : mimeType.substring(pos + 1, end)).trim();

        int eq = param.indexOf('=');
        if ((eq > 0) && param.substring(0, eq).trim().equalsIgnoreCase("charset")) {
            String value = param.substring(eq + 1).trim();

            // the parameter value may be written as a quoted string
            if ((value.length() >= 2) && (value.charAt(0) == '"') && (value.charAt(value.length() - 1) == '"')) {
                value = value.substring(1, value.length() - 1).trim();
            }
            return (value.length() == 0) ? null : value;
        }
        pos = end;
    }

    // no charset parameter present
    return null;
}

/**
* @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
Expand Down
19 changes: 13 additions & 6 deletions source/de/anomic/plasma/parser/AbstractParser.java
Expand Up @@ -103,6 +103,7 @@ public static final void checkInterruption() throws InterruptedException {
* Parsing a document available as byte array.
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
Expand All @@ -113,12 +114,13 @@ public static final void checkInterruption() throws InterruptedException {
public plasmaParserDocument parse(
URL location,
String mimeType,
String charset,
byte[] source
) throws ParserException, InterruptedException {
ByteArrayInputStream contentInputStream = null;
try {
contentInputStream = new ByteArrayInputStream(source);
return this.parse(location,mimeType,contentInputStream);
return this.parse(location,mimeType,charset,contentInputStream);
} finally {
if (contentInputStream != null) {
try {
Expand All @@ -133,19 +135,24 @@ public plasmaParserDocument parse(
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/
public plasmaParserDocument parse(URL location, String mimeType,
File sourceFile) throws ParserException, InterruptedException {
public plasmaParserDocument parse(
URL location,
String mimeType,
String charset,
File sourceFile
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
return this.parse(location, mimeType, contentInputStream);
return this.parse(location, mimeType, charset, contentInputStream);
} catch (FileNotFoundException e) {
throw new ParserException(e.getMessage());
} finally {
Expand All @@ -157,15 +164,15 @@ public plasmaParserDocument parse(URL location, String mimeType,
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
*/
public abstract plasmaParserDocument parse(URL location, String mimeType,
InputStream source) throws ParserException, InterruptedException;
public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;

/**
* @return Returns a list of library names that are needed by this parser
Expand Down
9 changes: 6 additions & 3 deletions source/de/anomic/plasma/parser/Parser.java
Expand Up @@ -64,39 +64,42 @@ public interface Parser {
* Parsing a document available as byte array
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
throws ParserException, InterruptedException;

/**
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
throws ParserException, InterruptedException;

/**
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, InputStream source)
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source)
throws ParserException, InterruptedException;

/**
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/parser/bzip/bzipParser.java
Expand Up @@ -87,7 +87,7 @@ public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {

File tempFile = null;
try {
Expand Down Expand Up @@ -126,7 +126,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou

// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,null,tempFile);
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/parser/doc/docParser.java
Expand Up @@ -78,7 +78,7 @@ public docParser() {
parserName = "Word Document Parser";
}

public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {


Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/parser/gzip/gzipParser.java
Expand Up @@ -83,7 +83,7 @@ public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {

File tempFile = null;
try {
Expand All @@ -110,7 +110,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou

// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,null,tempFile);
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
Expand Up @@ -125,7 +125,7 @@ public String getMimeType (File sourceFile) {
return null;
}

public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {

String orgMimeType = mimeType;

Expand Down Expand Up @@ -168,7 +168,7 @@ public plasmaParserDocument parse(URL location, String mimeType, File sourceFile

// parsing the content using the determined mimetype
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,mimeType,sourceFile);
return theParser.parseSource(location,mimeType,charset,sourceFile);
}
return null;

Expand All @@ -185,13 +185,13 @@ public plasmaParserDocument parse(URL location, String mimeType, File sourceFile
}
}

public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType,String charset,
InputStream source) throws ParserException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,dstFile);
return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} finally {
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/plasma/parser/odt/odtParser.java
Expand Up @@ -91,7 +91,7 @@ public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {

try {
byte[] docContent = null;
Expand Down Expand Up @@ -168,7 +168,7 @@ public plasmaParserDocument parse(URL location, String mimeType, File dest) thro
}
}

public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
File dest = null;
try {
// creating a tempfile
Expand All @@ -179,7 +179,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
serverFileUtils.copy(source, dest);

// parsing the content
return parse(location, mimeType, dest);
return parse(location, mimeType, charset, dest);
} catch (Exception e) {
throw new ParserException("Unable to parse the odt document. " + e.getMessage());
} finally {
Expand Down Expand Up @@ -210,7 +210,7 @@ public static void main(String[] args) {
ByteArrayInputStream input = new ByteArrayInputStream(content);

// parsing the document
testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);
testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);
} catch (Exception e) {
e.printStackTrace();
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/parser/pdf/pdfParser.java
Expand Up @@ -85,7 +85,7 @@ public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {


PDDocument theDocument = null;
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/plasma/parser/rpm/rpmParser.java
Expand Up @@ -91,21 +91,21 @@ public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException {
File dstFile = null;
try {
dstFile = File.createTempFile("rpmParser",".tmp");
serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,dstFile);
return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} finally {
if (dstFile != null) {dstFile.delete();}
}
}

public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
RPMFile rpmFile = null;
try {
String summary = null, description = null, name = sourceFile.getName();
Expand Down Expand Up @@ -177,7 +177,7 @@ public static void main(String[] args) {
rpmParser testParser = new rpmParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content);
testParser.parse(contentUrl, "application/x-rpm", input);
testParser.parse(contentUrl, "application/x-rpm", null, input);
} catch (Exception e) {
e.printStackTrace();
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/parser/rss/rssParser.java
Expand Up @@ -100,7 +100,7 @@ public rssParser() {
parserName = "Rich Site Summary/Atom Feed Parser";
}

public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {

try {
LinkedList feedSections = new LinkedList();
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/parser/rtf/rtfParser.java
Expand Up @@ -80,7 +80,7 @@ public rtfParser() {
parserName = "Rich Text Format Parser";
}

public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {


Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/parser/tar/tarParser.java
Expand Up @@ -94,7 +94,7 @@ public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {

try {
// creating a new parser class to parse the unzipped content
Expand Down Expand Up @@ -153,7 +153,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
checkInterruption();

// parsing the content
theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile);
theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
}
Expand Down

0 comments on commit d0a5a53

Please sign in to comment.