small change

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1027 6c8d7289-2bf4-0310-a012-ef5d649a1542
yacy · Nov 4, 2005 · 544e4ea · 544e4ea
1 parent 00ab4d8
commit 544e4ea
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 77 deletions.
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -3,7 +3,10 @@
 // (C) by Michael Peter Christen; mc@anomic.de
 // first published on http://www.anomic.de
 // Frankfurt, Germany, 2004
-// last major change: 18.02.2004
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -48,32 +51,31 @@
 import java.util.Map;
 import java.util.Properties;
 import java.util.TreeSet;
-
+import de.anomic.server.logging.serverLog;
 import de.anomic.server.serverByteBuffer;
 
-
 public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
 
-
     // statics: for initialisation of the HTMLFilterAbstractScraper
     private static TreeSet linkTags0;
     private static TreeSet linkTags1;
+
     private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
     static {
-	insensitiveCollator.setStrength(Collator.SECONDARY);
-	insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
+        insensitiveCollator.setStrength(Collator.SECONDARY);
+        insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
     }
-    
+
     static {
-	linkTags0 = new TreeSet(insensitiveCollator);
-	linkTags0.add("img");
+        linkTags0 = new TreeSet(insensitiveCollator);
+        linkTags0.add("img");
         linkTags0.add("base");
         linkTags0.add("frame");
 
-	linkTags1 = new TreeSet(insensitiveCollator);
-	linkTags1.add("a");
-	linkTags1.add("h1");
-	linkTags1.add("title");
+        linkTags1 = new TreeSet(insensitiveCollator);
+        linkTags1.add("a");
+        linkTags1.add("h1");
+        linkTags1.add("title");
     }
 
     // class variables: collectors for links
@@ -87,103 +89,120 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     public htmlFilterContentScraper(URL root) {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
-	super(linkTags0, linkTags1);
-	this.root = root;
-	this.anchors = new HashMap();
-	this.images = new HashMap();
-	this.title = "";
-	this.headline = "";
-	this.text = new serverByteBuffer(1024);
+        super(linkTags0, linkTags1);
+        this.root = root;
+        this.anchors = new HashMap();
+        this.images = new HashMap();
+        this.title = "";
+        this.headline = "";
+        this.text = new serverByteBuffer(1024);
     }
 
     public void scrapeText(byte[] newtext) {
-	//System.out.println("SCRAPE: " + new String(newtext));
-	if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
-	text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
+//      System.out.println("SCRAPE: " + new String(newtext));
+        if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
+        text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
     }
 
     public static String urlNormalform(URL url) {
         if (url == null) return null;
         return urlNormalform(url.toString());
     }
-    
+
     public static String urlNormalform(String us) {
-        if (us == null) return null;
-        if (us.length() == 0) return null;
-
+        serverLog.logFiner("htmlFilter", "urlNormalform:  IN=" + us);
+        if (us == null) { return null; }
+        if (us.length() == 0) { return null; }
+        // normalisation steps below: drop the #fragment, the scheme's default port and a trailing slash directly after the host
         /* TODO: what about 
          * - case insensitive domain names
          * - chars that should be escaped in URLs
          */
-        int p;
-
+
         // cutting of everything behind #
-        if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
-
-        if (us.startsWith("https")) {
-            if (us.endsWith(":443")) us = us.substring(0, us.length() - 4);
-            p = us.indexOf(":443/");
-            if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4));            
-        } else if (us.startsWith("http")) {
-            if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
-            p = us.indexOf(":80/");
-            if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
-        } 
+        int cpos = us.indexOf("#");
+        if (cpos >= 0) { us = us.substring(0, cpos); }
+        // "https" must be tested before "http": every https URL also starts with "http", so the reverse order never reaches the :443 branch
+        if (us.startsWith("https")) {
+            if (us.endsWith(":443")) {
+                us = us.substring(0, us.length() - 4);
+                serverLog.logFinest("htmlFilter", "urlNormalform:  :443=" + us);
+            } else {
+                cpos = us.indexOf(":443/");
+                if (cpos >= 0) {
+                    us = us.substring(0, cpos).concat(us.substring(cpos + 4));
+                    serverLog.logFinest("htmlFilter", "urlNormalform:  :443/=" + us);
+                }
+            }
+        } else if (us.startsWith("http")) {
+            if (us.endsWith(":80")) {
+                us = us.substring(0, us.length() - 3);
+                serverLog.logFinest("htmlFilter", "urlNormalform:  :80=" + us);
+            } else {
+                cpos = us.indexOf(":80/");
+                if (cpos >= 0) {
+                    us = us.substring(0, cpos).concat(us.substring(cpos + 3));
+                    serverLog.logFinest("htmlFilter", "urlNormalform:  :80/=" + us);
+                }
+            }
+        }
         if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
+        serverLog.logFiner("htmlFilter", "urlNormalform: OUT=" + us);
         return us;
-    }        
-    
+    }
+
     private String absolutePath(String relativePath) {
-	try {
-	    return urlNormalform(new URL(root, relativePath));
-	} catch (Exception e) {
-	    return "";
-	}
+        try {
+            return urlNormalform(new URL(root, relativePath));
+        } catch (Exception e) {
+            return "";
+        }
     }
 
     public void scrapeTag0(String tagname, Properties tagopts) {
-	if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
+        if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
         if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
         if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
     }
 
     public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
-	//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
-	if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
-	if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
-	if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();        
+//      System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
+        if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
+        if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
+        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();        
     }
 
     public String getHeadline() {
-	String hl = "";
+        String hl = "";
 
         // extract headline from content
-	if (title.length() > 0) hl = title.trim();
-	else if (headline.length() > 0) hl = headline.trim();
-	else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
-	else hl = text.trim().toString();
+        if (title.length() > 0) hl = title.trim();
+        else if (headline.length() > 0) hl = headline.trim();
+        else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
+        else hl = text.trim().toString();
 
         // clean the line: may contain too many funny symbols
         for (int i = 0; i < hl.length(); i++)
             if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1);
+
         // clean the line: remove double-spaces
         int p;
         while ((p = hl.indexOf("  ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1);        
-        
+
         // return result
-	return hl.trim();
+        return hl.trim();
     }
 
     public byte[] getText() {
-	return text.getBytes();
+        return text.getBytes();
     }
-    
+
     public Map getAnchors() {
-	return anchors;
+        return anchors;
     }
 
     public Map getImages() {
-	return images;
+        return images;
     }
 
     public void close() {
@@ -196,23 +215,22 @@ public void close() {
         text = null;
         root = null;
     }
-    
+
     public void print() {
-	System.out.println("TITLE   :" + title);
-	System.out.println("HEADLINE:" + headline);
-	System.out.println("ANCHORS :" + anchors.toString());
-	System.out.println("IMAGES  :" + images.toString());
-	System.out.println("TEXT    :" + new String(text.getBytes()));
+        System.out.println("TITLE   :" + title);
+        System.out.println("HEADLINE:" + headline);
+        System.out.println("ANCHORS :" + anchors.toString());
+        System.out.println("IMAGES  :" + images.toString());
+        System.out.println("TEXT    :" + new String(text.getBytes()));
     }
 
-
     public static void main(String[] args) {
-	String test = "Nokia kürzt bei Forschung und Entwicklung";
+        String test = "Nokia kürzt bei Forschung und Entwicklung";
         try {
             htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
             scraper.scrapeText(test.getBytes());
             System.out.println(new String(scraper.getText()));
         } catch (MalformedURLException e) {}
     }
-    
-}
+
+}
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java
@@ -134,10 +134,10 @@ public static char docType(String mime) {
         // serverLog.logFinest("PLASMA", "docType mime=" + mime);
         char doctype = DT_UNKNOWN;
         if (mime == null) doctype = DT_UNKNOWN;
-        else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
-        else if (mime.endsWith("/jpg")) doctype = DT_IMAGE;
+        else if (mime.startsWith("image/")) doctype = DT_IMAGE;
+/*      else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
         else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE;
-        else if (mime.endsWith("/png")) doctype = DT_IMAGE;
+        else if (mime.endsWith("/png")) doctype = DT_IMAGE; */
         else if (mime.endsWith("/html")) doctype = DT_HTML;
         else if (mime.endsWith("/rtf")) doctype = DT_DOC;
         else if (mime.endsWith("/pdf")) doctype = DT_PDFPS;
@@ -147,7 +147,7 @@ public static char docType(String mime) {
         else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC;
         else if (mime.endsWith("/postscript")) doctype = DT_PDFPS;
         else if (mime.startsWith("text/")) doctype = DT_TEXT;
-        else if (mime.startsWith("image/")) doctype = DT_IMAGE;
+//      else if (mime.startsWith("image/")) doctype = DT_IMAGE;
         else if (mime.startsWith("audio/")) doctype = DT_AUDIO;
         else if (mime.startsWith("video/")) doctype = DT_MOVIE;
         //bz2     = application/x-bzip2