*) favicons that are specified in the document content via html link-tags are now detected and displayed on the search page (requested by allo).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3845 6c8d7289-2bf4-0310-a012-ef5d649a1542
yacy · Jun 9, 2007 · 339153d · 339153d
1 parent 854eb14
commit 339153d
Show file tree

Hide file tree

Showing 10 changed files with 87 additions and 18 deletions.
diff --git a/htroot/js/yacysearch.js b/htroot/js/yacysearch.js
@@ -65,6 +65,7 @@ function handleTextState(req) {
 	var snippetText = response.getElementsByTagName("text")[0].firstChild.data;
 	var urlHash = response.getElementsByTagName("urlHash")[0].firstChild.data;
 	var status = response.getElementsByTagName("status")[0].firstChild.data;
+
 
 	var span = document.getElementById("h" + urlHash);
 	removeAllChildren(span);
@@ -77,6 +78,12 @@ function handleTextState(req) {
 		span.parentNode.parentNode.setAttribute("style", "display: none");
 		document.getElementById("hidden_results").innerHTML='Some results were hidden, because they do not contain your searchwords anymore, or because they are not accessible. Click here to <a href="javascript:show_hidden_results()">show them</a>';
 	}
+
+	// set URL to favicon (if a link-tag was found in the document)
+	if (response.getElementsByTagName("favicon")[0].firstChild != null) {
+		var img = document.getElementById("f" + urlHash);
+		img.src = response.getElementsByTagName("favicon")[0].firstChild.data;
+	}
 
 	// replace "<b>" text by <strong> node
 	var pos1=snippetText.indexOf("<b>");

diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java
@@ -70,6 +70,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
             }
             prop.put("link", 0);
             prop.put("links", 0);
+            prop.putSafeXML("favicon",snippet.getFavicon()==null?"":snippet.getFavicon().toString());
         } else {
             // attach media information
             ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, mediasnippet_timeout);
@@ -85,6 +86,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
             prop.put("text", "");
             prop.put("link", mediaSnippets.size());
             prop.put("links", mediaSnippets.size());
+            prop.put("favicon","");
         }
 
 

diff --git a/htroot/xml/snippet.xml b/htroot/xml/snippet.xml
@@ -4,6 +4,7 @@
 	<status>#[status]#</status>
 	<urlHash>#[urlHash]#</urlHash>
 	<links>#[links]#</links>
+	<favicon>#[favicon]#</favicon>
 	#{link}#
 	<link>
 		<type>#[type]#</type>

diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java
@@ -66,6 +66,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
         serverObjects prop = new serverObjects();
         prop.put("sitemap", "");
         prop.put("title", "");
+        prop.put("favicon","");
         prop.put("robots-allowed", 3); //unknown
         String actions="title";
         if(post!=null && post.containsKey("url")){
@@ -90,27 +91,29 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
                     serverFileUtils.write(contentString,writer);
                     writer.close();
 
+                    // put the document title 
                     prop.put("title", scraper.getTitle());
+
+                    // put the favicon that belongs to the document
+                    prop.putSafeXML("favicon", (scraper.getFavicon()==null)?"":scraper.getFavicon().toString());
+
+                    // put keywords
                     String list[]=scraper.getKeywords();
                     for(int i=0;i<list.length;i++){
                     	prop.putSafeXML("tags_"+i+"_tag", list[i]);
                     }
                     prop.put("tags", list.length);
 
-                } catch (MalformedURLException e) {
-                } catch (IOException e) {
+                } catch (MalformedURLException e) { /* ignore this */
+                } catch (IOException e) { /* ignore this */
                 }
             }
             if(actions.indexOf("robots")>=0){
                 try {
                 	URL theURL = new URL(url);
 
                 	// determine if crawling of the current URL is allowed
-                    if(robotsParser.isDisallowed(theURL)){
-                        prop.put("robots-allowed", 0);
-                    }else{
-                        prop.put("robots-allowed", 1);
-                    }
+                	prop.put("robots-allowed", robotsParser.isDisallowed(theURL) ? 0:1);
 
                     // get the sitemap URL of the domain
                     URL sitemapURL = robotsParser.getSitemapURL(theURL);

diff --git a/htroot/xml/util/getpageinfo_p.xml b/htroot/xml/util/getpageinfo_p.xml
@@ -3,6 +3,7 @@
   <title>#[title]#</title>
   <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
   <sitemap>#[sitemap]#</sitemap>
+  <favicon>#[favicon]#</favicon>
   <tags>
     #{tags}#
     <tag name="#[tag]#" />

diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html
@@ -150,7 +150,7 @@ <h2>#[promoteSearchPageGreeting]#</h2>
 	         #(/recommend)#
 	       </div>
 	       #(/authorized)#
-	       <h4 class="linktitle"><img src="#[favicon]#" class="favicon" width="16" height="16" /><a href="#[url]#" target="_parent">#[description]#</a></h4>
+	       <h4 class="linktitle"><img src="#[favicon]#" id="f#[urlhash]#" class="favicon" width="16" height="16" /><a href="#[url]#" target="_parent">#[description]#</a></h4>
 	       <p class="snippet iconindented"><span class="#(snippet)#snippetLoading::snippetLoaded#(/snippet)#" id="h#[urlhash]#">#(snippet)#loading snippet ...::#[text]##(/snippet)#</span></p>
 	       <p class="url iconindented"><a href="#[url]#" id="url#[urlhash]#" target="_parent">#[urlname]#</a></p>
 	       <p class="urlinfo iconindented">#[date]# | YBR-#[ybr]# | <a href="ViewFile.html?urlHash=#[urlhash]#&amp;words=#[words]#">Info</a> | <a href="yacysearch.html?cat=image&amp;url=#[url]#&amp;search=#[former]#">Pictures</a></p>

diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -109,6 +109,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     private serverCharBuffer content;
     private EventListenerList htmlFilterEventListeners = new EventListenerList();
 
+    /**
+     * {@link URL} to the favicon that belongs to the document
+     */
+    private URL favicon;
+
+    /**
+     * The document root {@link URL} 
+     */
     private URL root;
 
     public htmlFilterContentScraper(URL root) {
@@ -207,7 +215,8 @@ public void scrapeTag0(String tagname, Properties tagopts) {
 
                 if (type.equalsIgnoreCase("shortcut icon")) {
                     htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
-                    images.add(ie);                
+                    images.add(ie);    
+                    this.favicon = newLink;
                 } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
                     anchors.put(newLink.toString(), linktitle);                    
                 }
@@ -346,6 +355,13 @@ public TreeSet getImages() {
     public Map getMetas() {
         return metas;
     }
+
+    /**
+     * @return the {@link URL} to the favicon that belongs to the document
+     */    
+    public URL getFavicon() {
+    	return this.favicon;
+    }
 
     public String getDescription() {
         String s = (String) metas.get("description");

diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
@@ -765,7 +765,8 @@ public plasmaParserDocument transformScraper(URL location, String mimeType, Stri
                     scraper.getText(),
                     scraper.getAnchors(),
                     scraper.getImages());
-            //scraper.close();
+            //scraper.close();            
+            ppd.setFavicon(scraper.getFavicon());
             return ppd;
         } catch (MalformedURLException e) {
             //e.printStackTrace();

diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -83,6 +83,7 @@ public class plasmaParserDocument {
     // text in image tags.
     private Map hyperlinks, audiolinks, videolinks, applinks;
     private Map emaillinks;
+    private URL favicon;
     private boolean resorted;
     private InputStream textStream;
 
@@ -395,6 +396,20 @@ public void addSubDocument(plasmaParserDocument doc) throws IOException {
         images.addAll(doc.getImages());
     }
 
+    /**
+     * @return the {@link URL} to the favicon that belongs to the document
+     */
+    public URL getFavicon() {
+    	return this.favicon;
+    }
+
+    /**
+     * @param faviconURL the {@link URL} to the favicon that belongs to the document
+     */
+    public void setFavicon(URL faviconURL) {
+    	this.favicon = faviconURL;
+    }
+
     public void close() {
         // try close the output stream
         if (this.textStream != null) {

diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -89,6 +89,15 @@ public class plasmaSnippetCache {
     private int                   snippetsScoreCounter;
     private kelondroMScoreCluster snippetsScore;
     private HashMap               snippetsCache;
+
+    /**
+     * A cache holding URLs to favicons specified by the page content, e.g. by using the HTML link tag:
+     * <pre>
+     * 	 &lt;link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico"&gt;
+     * </pre>
+     */
+    private HashMap				  faviconCache;
+
     private plasmaHTCache         cacheManager;
     private plasmaParser          parser;
     private serverLog             log;
@@ -106,7 +115,8 @@ public plasmaSnippetCache(
         this.sb = theSb;
         this.snippetsScoreCounter = 0;
         this.snippetsScore = new kelondroMScoreCluster();
-        this.snippetsCache = new HashMap();        
+        this.snippetsCache = new HashMap(); 
+        this.faviconCache = new HashMap();
     }
 
     public class TextSnippet {
@@ -115,12 +125,19 @@ public class TextSnippet {
         private String error;
         private int errorCode;
         private Set remaingHashes;
+        private URL favicon;
+
         public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
+        	this(url,line,errorCode,remaingHashes,errortext,null);
+        }
+
+        public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext, URL favicon) {
             this.url = url;
             this.line = line;
             this.errorCode = errorCode;
             this.error = errortext;
             this.remaingHashes = remaingHashes;
+            this.favicon = favicon;
         }
         public URL getUrl() {
             return this.url;
@@ -213,6 +230,10 @@ else if(k == (w[j].length()-1)) {
             }
             return l.toString().trim();
         }
+
+        public URL getFavicon() {
+        	return this.favicon;
+        }
     }
 
     public class MediaSnippet {
@@ -244,9 +265,9 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
         int source = SOURCE_CACHE;
         String wordhashes = yacySearch.set2string(queryhashes);
         String line = retrieveFromCache(wordhashes, urlhash);
-        if (line != null) {
+        if (line != null) {        	
             //System.out.println("found snippet for URL " + url + " in cache: " + line);
-            return new TextSnippet(url, line, source, null, null);
+            return new TextSnippet(url, line, source, null, null,(URL)this.faviconCache.get(urlhash));
         }
 
         /* ===========================================================================
@@ -300,7 +321,7 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
          * =========================================================================== */
         plasmaParserDocument document = null;
         try {
-             document = parseDocument(url, resContentLength, resContent, resInfo);            
+             document = parseDocument(url, resContentLength, resContent, resInfo);
         } catch (ParserException e) {
             return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
         } finally {
@@ -311,12 +332,14 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
 
         /* ===========================================================================
          * COMPUTE SNIPPET
-         * =========================================================================== */        
+         * =========================================================================== */    
+        URL resFavicon = document.getFavicon();
+        if (resFavicon != null) this.faviconCache.put(urlhash,resFavicon);
         // we have found a parseable non-empty file: use the lines
 
         // compute snippet from text
         final Iterator sentences = document.getSentences(pre);
-        if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences");
+        if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon);
         Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
         String textline = (tsr == null) ? null : (String) tsr[0];
         Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
@@ -335,13 +358,13 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
         //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
         if (textline  != null) line += (line.length() == 0) ? textline  : "<br />" + textline;
 
-        if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found");
+        if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon);
         if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
         storeToCache(wordhashes, urlhash, line);
         document.close();
-        return new TextSnippet(url, line, source, null, null);
+        return new TextSnippet(url, line, source, null, null, resFavicon);
     }
 
     /**