*) changes needed for multi-language support

- parsers may need to know the charset of the byte stream git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542
yacy · Sep 15, 2006 · d0a5a53 · d0a5a53
1 parent 31d6cde
commit d0a5a53
Show file tree

Hide file tree

Showing 23 changed files with 133 additions and 65 deletions.
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -47,6 +47,8 @@
 import de.anomic.net.URL;
 
 import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     //private String headline;
     private List[] headlines;
     private serverByteBuffer content;
+
     private URL root;
+    private String charset = "UTF-8";
 
     public htmlFilterContentScraper(URL root) {
         // the root value here will not be used to load the resource.
@@ -109,6 +113,14 @@ public htmlFilterContentScraper(URL root) {
         this.content = new serverByteBuffer(1024);
     }
 
+    /** Stores the charset name; throws UnsupportedCharsetException if this JVM lacks the charset. */
+    public void setCharset(String charset) throws UnsupportedCharsetException {
+        if (!Charset.isSupported(charset)) {
+            throw new UnsupportedCharsetException(charset);
+        }
+        this.charset = charset;
+    }
+
     public void scrapeText(byte[] newtext) {
         // System.out.println("SCRAPE: " + new String(newtext));
         if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);

diff --git a/source/de/anomic/plasma/cache/IResourceInfo.java b/source/de/anomic/plasma/cache/IResourceInfo.java
@@ -82,6 +82,12 @@ public interface IResourceInfo {
      */
     public String getMimeType();
 
+    /**
+     * Returns the charset of the resource
+     * @return the name of the charset or <code>null</code> if unknown
+     */
+    public String getCharSet();
+
     /**
      * Returns the modification date of the cached object
      * @return the modifiaction date

diff --git a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
@@ -161,4 +161,8 @@ public boolean validResponseStatus(String responseStatus) {
         return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
     }
 
+    public String getCharSet() {
+        return null; // this implementation never reports a charset
+    }
+
 }
diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java
@@ -110,6 +110,38 @@ public String getMimeType() {
         int pos = mimeType.indexOf(';');
         return ((pos < 0) ? mimeType : mimeType.substring(0, pos));          
     }
+
+    /**
+     * Returns the charset announced in the content-type of the response header.
+     * @return the value of the <code>charset</code> content-type parameter
+     * (e.g. <code>UTF-8</code> for <code>text/html; charset=UTF-8</code>)
+     * or <code>null</code> if no response header or no such parameter is available
+     * @see de.anomic.plasma.cache.IResourceInfo#getCharSet()
+     */
+    public String getCharSet() {
+        if (this.responseHeader == null) return null;
+
+        String mimeType = this.responseHeader.mime();
+        if (mimeType == null) return null;
+
+        // everything up to the first ';' is the mime type itself; the charset is
+        // one of the ';'-separated parameters that follow it
+        int pos = mimeType.indexOf(';');
+        while (pos >= 0) {
+            int end = mimeType.indexOf(';', pos + 1);
+            String param = ((end < 0) ? mimeType.substring(pos + 1) : mimeType.substring(pos + 1, end)).trim();
+            if (param.regionMatches(true, 0, "charset=", 0, 8)) {
+                String name = param.substring(8).trim();
+                // the value may be given as a quoted string
+                if ((name.length() >= 2) && name.startsWith("\"") && name.endsWith("\"")) {
+                    name = name.substring(1, name.length() - 1).trim();
+                }
+                return (name.length() == 0) ? null : name;
+            }
+            pos = end;
+        }
+        return null;
+    }
 
     /**
      * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()

diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -103,6 +103,7 @@ public static final void checkInterruption() throws InterruptedException {
 	 * Parsing a document available as byte array.
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param source the content byte array
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
@@ -113,12 +114,13 @@ public static final void checkInterruption() throws InterruptedException {
 	public plasmaParserDocument parse(
             URL location, 
             String mimeType,
+            String charset,
             byte[] source
     ) throws ParserException, InterruptedException {
         ByteArrayInputStream contentInputStream = null;
         try {
             contentInputStream = new ByteArrayInputStream(source);
-            return this.parse(location,mimeType,contentInputStream); 
+            return this.parse(location,mimeType,charset,contentInputStream); 
         } finally {
             if (contentInputStream != null) {
                 try {
@@ -133,19 +135,24 @@ public plasmaParserDocument parse(
 	 * Parsing a document stored in a {@link File}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param sourceFile the file containing the content of the document
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
 	 * @throws ParserException if the content could not be parsed properly 
 	 * 
 	 * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
 	 */
-	public plasmaParserDocument parse(URL location, String mimeType,
-			File sourceFile) throws ParserException, InterruptedException {
+	public plasmaParserDocument parse(
+            URL location, 
+            String mimeType,
+            String charset,
+			File sourceFile
+	) throws ParserException, InterruptedException {
         BufferedInputStream contentInputStream = null;
         try {
             contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            return this.parse(location, mimeType, contentInputStream);
+            return this.parse(location, mimeType, charset, contentInputStream);
         } catch (FileNotFoundException e) {
             throw new ParserException(e.getMessage());
         } finally {
@@ -157,15 +164,15 @@ public plasmaParserDocument parse(URL location, String mimeType,
      * Parsing a document available as {@link InputStream}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param source the {@link InputStream} containing the document content
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      * @throws ParserException if the content could not be parsed properly 
      * 
      * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
      */
-    public abstract plasmaParserDocument parse(URL location, String mimeType,
-			InputStream source) throws ParserException, InterruptedException;
+    public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
 
     /**
      * @return Returns a list of library names that are needed by this parser

diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
@@ -64,39 +64,42 @@ public interface Parser {
      * Parsing a document available as byte array
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param source the content byte array
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      *  
      * @throws ParserException if the content could not be parsed properly 
      */
-    public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
     throws ParserException, InterruptedException;
 
     /**
      * Parsing a document stored in a {@link File}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown 
      * @param sourceFile the file containing the content of the document
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      *  
      * @throws ParserException if the content could not be parsed properly 
      */    
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
     throws ParserException, InterruptedException;
 
     /**
      * Parsing a document available as {@link InputStream}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown 
      * @param source the {@link InputStream} containing the document content
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      *  
      * @throws ParserException if the content could not be parsed properly 
      */    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) 
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) 
     throws ParserException, InterruptedException;
 
     /**

diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@@ -87,7 +87,7 @@ public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
 
         File tempFile = null;
         try {           
@@ -126,7 +126,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
 
             // creating a new parser class to parse the unzipped content
             plasmaParser theParser = new plasmaParser();
-            return theParser.parseSource(location,null,tempFile);
+            return theParser.parseSource(location,null,null,tempFile);
         } catch (Exception e) {  
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             throw new ParserException("Unable to parse the gzip content. " + e.getMessage());

diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -78,7 +78,7 @@ public docParser() {
         parserName = "Word Document Parser";
 	}
 
-	public plasmaParserDocument parse(URL location, String mimeType,
+	public plasmaParserDocument parse(URL location, String mimeType, String charset,
 			InputStream source) throws ParserException, InterruptedException {
 
 

diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@@ -83,7 +83,7 @@ public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
 
         File tempFile = null;
         try {           
@@ -110,7 +110,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
 
             // creating a new parser class to parse the unzipped content
             plasmaParser theParser = new plasmaParser();
-            return theParser.parseSource(location,null,tempFile);
+            return theParser.parseSource(location,null,null,tempFile);
         } catch (Exception e) {    
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             throw new ParserException("Unable to parse the gzip content. " + e.getMessage());

diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -125,7 +125,7 @@ public String getMimeType (File sourceFile) {
         return null;        
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
 
         String orgMimeType = mimeType;
 
@@ -168,7 +168,7 @@ public plasmaParserDocument parse(URL location, String mimeType, File sourceFile
 
                 // parsing the content using the determined mimetype
                 plasmaParser theParser = new plasmaParser();
-                return theParser.parseSource(location,mimeType,sourceFile);
+                return theParser.parseSource(location,mimeType,charset,sourceFile);
             }
             return null;
 
@@ -185,13 +185,13 @@ public plasmaParserDocument parse(URL location, String mimeType, File sourceFile
         }
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType,
+    public plasmaParserDocument parse(URL location, String mimeType,String charset,
             InputStream source) throws ParserException {
         File dstFile = null;
         try {
             dstFile = File.createTempFile("mimeTypeParser",".tmp");
             serverFileUtils.copy(source,dstFile);
-            return parse(location,mimeType,dstFile);
+            return parse(location,mimeType,charset,dstFile);
         } catch (Exception e) {            
             return null;
         } finally {

diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -91,7 +91,7 @@ public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
 
         try {          
             byte[] docContent     = null;
@@ -168,7 +168,7 @@ public plasmaParserDocument parse(URL location, String mimeType, File dest) thro
         }
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
         File dest = null;
         try {
             // creating a tempfile
@@ -179,7 +179,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
             serverFileUtils.copy(source, dest);
 
             // parsing the content
-            return parse(location, mimeType, dest);
+            return parse(location, mimeType, charset, dest);
         } catch (Exception e) {
             throw new ParserException("Unable to parse the odt document. " + e.getMessage());
         } finally {
@@ -210,7 +210,7 @@ public static void main(String[] args) {
             ByteArrayInputStream input = new ByteArrayInputStream(content);
 
             // parsing the document
-            testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);            
+            testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);            
         } catch (Exception e) {
             e.printStackTrace();
         }

diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -85,7 +85,7 @@ public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
 
 
         PDDocument theDocument = null;

diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -91,21 +91,21 @@ public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType,
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
             InputStream source) throws ParserException {
         File dstFile = null;
         try {
             dstFile = File.createTempFile("rpmParser",".tmp");
             serverFileUtils.copy(source,dstFile);
-            return parse(location,mimeType,dstFile);
+            return parse(location,mimeType,charset,dstFile);
         } catch (Exception e) {            
             return null;
         } finally {
             if (dstFile != null) {dstFile.delete();}            
         }        
     }    
 
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
         RPMFile rpmFile = null;        
         try {
             String summary = null, description = null, name = sourceFile.getName();
@@ -177,7 +177,7 @@ public static void main(String[] args) {
             rpmParser testParser = new rpmParser();
             byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
             ByteArrayInputStream input = new ByteArrayInputStream(content);
-            testParser.parse(contentUrl, "application/x-rpm", input);
+            testParser.parse(contentUrl, "application/x-rpm", null, input);
         } catch (Exception e) {
             e.printStackTrace();
         }

diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -100,7 +100,7 @@ public rssParser() {
         parserName = "Rich Site Summary/Atom Feed Parser"; 
 	}
 
-	public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+	public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
 
         try {
             LinkedList feedSections = new LinkedList();

diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -80,7 +80,7 @@ public rtfParser() {
         parserName = "Rich Text Format Parser";  
 	}
 
-	public plasmaParserDocument parse(URL location, String mimeType,
+	public plasmaParserDocument parse(URL location, String mimeType, String charset,
 			InputStream source) throws ParserException, InterruptedException {
 
 

diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -94,7 +94,7 @@ public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
 
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
 
         try {           
             // creating a new parser class to parse the unzipped content
@@ -153,7 +153,7 @@ public plasmaParserDocument parse(URL location, String mimeType, InputStream sou
                     checkInterruption();
 
                     // parsing the content                    
-                    theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile);
+                    theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
                 } finally {
                     if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
                 }