Skip to content

Commit

Permalink
*) favicons that are specified in the document content via html link-…
Browse files Browse the repository at this point in the history
…tags

   are now detected and displayed on the search page (requested by allo).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3845 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Jun 9, 2007
1 parent 854eb14 commit 339153d
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 18 deletions.
7 changes: 7 additions & 0 deletions htroot/js/yacysearch.js
Expand Up @@ -65,6 +65,7 @@ function handleTextState(req) {
var snippetText = response.getElementsByTagName("text")[0].firstChild.data;
var urlHash = response.getElementsByTagName("urlHash")[0].firstChild.data;
var status = response.getElementsByTagName("status")[0].firstChild.data;


var span = document.getElementById("h" + urlHash);
removeAllChildren(span);
Expand All @@ -77,6 +78,12 @@ function handleTextState(req) {
span.parentNode.parentNode.setAttribute("style", "display: none");
document.getElementById("hidden_results").innerHTML='Some results were hidden, because they do not contain your searchwords anymore, or because they are not accessible. Click here to <a href="javascript:show_hidden_results()">show them</a>';
}

// set URL to favicon (if a link-tag was found in the document).
// Guard against responses that carry no <favicon> element at all and
// against result entries that have no matching favicon image in the page,
// so that a missing node does not abort the rest of the snippet handling.
var faviconNodes = response.getElementsByTagName("favicon");
if (faviconNodes.length > 0 && faviconNodes[0].firstChild != null) {
    var img = document.getElementById("f" + urlHash);
    if (img != null) {
        img.src = faviconNodes[0].firstChild.data;
    }
}

// replace "<b>" text by <strong> node
var pos1=snippetText.indexOf("<b>");
Expand Down
2 changes: 2 additions & 0 deletions htroot/xml/snippet.java
Expand Up @@ -70,6 +70,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}
prop.put("link", 0);
prop.put("links", 0);
prop.putSafeXML("favicon",snippet.getFavicon()==null?"":snippet.getFavicon().toString());
} else {
// attach media information
ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, mediasnippet_timeout);
Expand All @@ -85,6 +86,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("text", "");
prop.put("link", mediaSnippets.size());
prop.put("links", mediaSnippets.size());
prop.put("favicon","");
}


Expand Down
1 change: 1 addition & 0 deletions htroot/xml/snippet.xml
Expand Up @@ -4,6 +4,7 @@
<status>#[status]#</status>
<urlHash>#[urlHash]#</urlHash>
<links>#[links]#</links>
<favicon>#[favicon]#</favicon>
#{link}#
<link>
<type>#[type]#</type>
Expand Down
17 changes: 10 additions & 7 deletions htroot/xml/util/getpageinfo_p.java
Expand Up @@ -66,6 +66,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
serverObjects prop = new serverObjects();
prop.put("sitemap", "");
prop.put("title", "");
prop.put("favicon","");
prop.put("robots-allowed", 3); //unknown
String actions="title";
if(post!=null && post.containsKey("url")){
Expand All @@ -90,27 +91,29 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
serverFileUtils.write(contentString,writer);
writer.close();

// put the document title
prop.put("title", scraper.getTitle());

// put the favicon that belongs to the document
prop.putSafeXML("favicon", (scraper.getFavicon()==null)?"":scraper.getFavicon().toString());

// put keywords
String list[]=scraper.getKeywords();
for(int i=0;i<list.length;i++){
prop.putSafeXML("tags_"+i+"_tag", list[i]);
}
prop.put("tags", list.length);

} catch (MalformedURLException e) {
} catch (IOException e) {
} catch (MalformedURLException e) { /* ignore this */
} catch (IOException e) { /* ignore this */
}
}
if(actions.indexOf("robots")>=0){
try {
URL theURL = new URL(url);

// determine if crawling of the current URL is allowed
if(robotsParser.isDisallowed(theURL)){
prop.put("robots-allowed", 0);
}else{
prop.put("robots-allowed", 1);
}
prop.put("robots-allowed", robotsParser.isDisallowed(theURL) ? 0:1);

// get the sitemap URL of the domain
URL sitemapURL = robotsParser.getSitemapURL(theURL);
Expand Down
1 change: 1 addition & 0 deletions htroot/xml/util/getpageinfo_p.xml
Expand Up @@ -3,6 +3,7 @@
<title>#[title]#</title>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon>
<tags>
#{tags}#
<tag name="#[tag]#" />
Expand Down
2 changes: 1 addition & 1 deletion htroot/yacysearch.html
Expand Up @@ -150,7 +150,7 @@ <h2>#[promoteSearchPageGreeting]#</h2>
#(/recommend)#
</div>
#(/authorized)#
<h4 class="linktitle"><img src="#[favicon]#" class="favicon" width="16" height="16" /><a href="#[url]#" target="_parent">#[description]#</a></h4>
<h4 class="linktitle"><img src="#[favicon]#" id="f#[urlhash]#" class="favicon" width="16" height="16" /><a href="#[url]#" target="_parent">#[description]#</a></h4>
<p class="snippet iconindented"><span class="#(snippet)#snippetLoading::snippetLoaded#(/snippet)#" id="h#[urlhash]#">#(snippet)#loading snippet ...::#[text]##(/snippet)#</span></p>
<p class="url iconindented"><a href="#[url]#" id="url#[urlhash]#" target="_parent">#[urlname]#</a></p>
<p class="urlinfo iconindented">#[date]# | YBR-#[ybr]# | <a href="ViewFile.html?urlHash=#[urlhash]#&amp;words=#[words]#">Info</a> | <a href="yacysearch.html?cat=image&amp;url=#[url]#&amp;search=#[former]#">Pictures</a></p>
Expand Down
18 changes: 17 additions & 1 deletion source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -109,6 +109,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
private serverCharBuffer content;
private EventListenerList htmlFilterEventListeners = new EventListenerList();

/**
* {@link URL} to the favicon that belongs to the document
*/
private URL favicon;

/**
* The document root {@link URL}
*/
private URL root;

public htmlFilterContentScraper(URL root) {
Expand Down Expand Up @@ -207,7 +215,8 @@ public void scrapeTag0(String tagname, Properties tagopts) {

if (type.equalsIgnoreCase("shortcut icon")) {
htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
images.add(ie);
images.add(ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink.toString(), linktitle);
}
Expand Down Expand Up @@ -346,6 +355,13 @@ public TreeSet getImages() {
/**
 * @return the map of meta entries held by this scraper (for example the
 *         entry stored under the key "description")
 */
public Map getMetas() {
    return this.metas;
}

/**
 * Returns the favicon location recorded while scraping the document.
 * The value is set when a link-tag of type "shortcut icon" is scraped.
 *
 * @return the {@link URL} of the favicon that belongs to the document,
 *         or <code>null</code> if no such link-tag was seen
 */
public URL getFavicon() {
    return favicon;
}

public String getDescription() {
String s = (String) metas.get("description");
Expand Down
3 changes: 2 additions & 1 deletion source/de/anomic/plasma/plasmaParser.java
Expand Up @@ -765,7 +765,8 @@ public plasmaParserDocument transformScraper(URL location, String mimeType, Stri
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
//scraper.close();
//scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd;
} catch (MalformedURLException e) {
//e.printStackTrace();
Expand Down
15 changes: 15 additions & 0 deletions source/de/anomic/plasma/plasmaParserDocument.java
Expand Up @@ -83,6 +83,7 @@ public class plasmaParserDocument {
// text in image tags.
private Map hyperlinks, audiolinks, videolinks, applinks;
private Map emaillinks;
private URL favicon;
private boolean resorted;
private InputStream textStream;

Expand Down Expand Up @@ -395,6 +396,20 @@ public void addSubDocument(plasmaParserDocument doc) throws IOException {
images.addAll(doc.getImages());
}

/**
 * Returns the favicon location that was attached to this document
 * via {@link #setFavicon(URL)}.
 *
 * @return the {@link URL} of the favicon that belongs to the document;
 *         may be <code>null</code> if none has been set
 */
public URL getFavicon() {
    return favicon;
}

/**
 * Attaches a favicon location to this document, replacing any
 * previously stored value.
 *
 * @param faviconURL the {@link URL} of the favicon that belongs to the document
 */
public void setFavicon(URL faviconURL) {
    favicon = faviconURL;
}

public void close() {
// try close the output stream
if (this.textStream != null) {
Expand Down
39 changes: 31 additions & 8 deletions source/de/anomic/plasma/plasmaSnippetCache.java
Expand Up @@ -89,6 +89,15 @@ public class plasmaSnippetCache {
private int snippetsScoreCounter;
private kelondroMScoreCluster snippetsScore;
private HashMap snippetsCache;

/**
* A cache that maps URL hashes to the favicon URLs specified by the page content, for example via the html link-tag:
* <pre>
* &lt;link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico"&gt;
* </pre>
*/
private HashMap faviconCache;

private plasmaHTCache cacheManager;
private plasmaParser parser;
private serverLog log;
Expand All @@ -106,7 +115,8 @@ public plasmaSnippetCache(
this.sb = theSb;
this.snippetsScoreCounter = 0;
this.snippetsScore = new kelondroMScoreCluster();
this.snippetsCache = new HashMap();
this.snippetsCache = new HashMap();
this.faviconCache = new HashMap();
}

public class TextSnippet {
Expand All @@ -115,12 +125,19 @@ public class TextSnippet {
private String error;
private int errorCode;
private Set remaingHashes;
private URL favicon;

/**
 * Creates a snippet without favicon information; delegates to the
 * six-argument constructor with a <code>null</code> favicon.
 *
 * @param url           the URL the snippet belongs to
 * @param line          the snippet text
 * @param errorCode     the status or error code of the snippet
 * @param remaingHashes the set of hashes passed on to the snippet
 * @param errortext     the error description
 */
public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
    this(url, line, errorCode, remaingHashes, errortext, null);
}

/**
 * Creates a snippet and stores all given values unchanged.
 *
 * @param url           the URL the snippet belongs to
 * @param line          the snippet text
 * @param errorCode     the status or error code of the snippet
 * @param remaingHashes the set of hashes passed on to the snippet
 * @param errortext     the error description
 * @param favicon       the favicon URL of the document; may be <code>null</code>
 */
public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext, URL favicon) {
    // what the snippet refers to
    this.url = url;
    this.favicon = favicon;

    // snippet content
    this.line = line;
    this.remaingHashes = remaingHashes;

    // status information
    this.errorCode = errorCode;
    this.error = errortext;
}
public URL getUrl() {
return this.url;
Expand Down Expand Up @@ -213,6 +230,10 @@ else if(k == (w[j].length()-1)) {
}
return l.toString().trim();
}

/**
 * @return the favicon {@link URL} stored with this snippet, or
 *         <code>null</code> if the snippet was created without one
 */
public URL getFavicon() {
    return favicon;
}
}

public class MediaSnippet {
Expand Down Expand Up @@ -244,9 +265,9 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
int source = SOURCE_CACHE;
String wordhashes = yacySearch.set2string(queryhashes);
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
if (line != null) {
//System.out.println("found snippet for URL " + url + " in cache: " + line);
return new TextSnippet(url, line, source, null, null);
return new TextSnippet(url, line, source, null, null,(URL)this.faviconCache.get(urlhash));
}

/* ===========================================================================
Expand Down Expand Up @@ -300,7 +321,7 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
* =========================================================================== */
plasmaParserDocument document = null;
try {
document = parseDocument(url, resContentLength, resContent, resInfo);
document = parseDocument(url, resContentLength, resContent, resInfo);
} catch (ParserException e) {
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {
Expand All @@ -311,12 +332,14 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn

/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
* =========================================================================== */
URL resFavicon = document.getFavicon();
if (resFavicon != null) this.faviconCache.put(urlhash,resFavicon);
// we have found a parseable non-empty file: use the lines

// compute snippet from text
final Iterator sentences = document.getSentences(pre);
if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences");
if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon);
Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
String textline = (tsr == null) ? null : (String) tsr[0];
Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
Expand All @@ -335,13 +358,13 @@ public TextSnippet retrieveTextSnippet(URL url, Set queryhashes, boolean fetchOn
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;

if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found");
if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon);
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);

// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
document.close();
return new TextSnippet(url, line, source, null, null);
return new TextSnippet(url, line, source, null, null, resFavicon);
}

/**
Expand Down

0 comments on commit 339153d

Please sign in to comment.