Skip to content

Commit

Permalink
fix more encoding problems in yacysearch.rss.
Browse files Browse the repository at this point in the history
- URL encoding for search terms where required
- removed "ugly" CDATA escaping
- UTF-8 encoding for the XML
- no HTML style escaping for XML/RSS element values
Note: some unicode characters might still be encoded in a wrong way.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4140 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
fuchsi committed Oct 4, 2007
1 parent 6b00fe0 commit c5a8585
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 24 deletions.
10 changes: 8 additions & 2 deletions htroot/yacysearch.java
Expand Up @@ -51,6 +51,7 @@
import java.util.HashMap;
import java.util.TreeSet;

import de.anomic.data.htmlTools;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
Expand Down Expand Up @@ -381,8 +382,8 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
// adding some additional properties needed for the rss feed
String hostName = (String) header.get("Host", "localhost");
if (hostName.indexOf(":") == -1) hostName += ":" + serverCore.getPortNr(env.getConfig("port", "8080"));
prop.put("searchBaseURL", "http://" + hostName + "/yacysearch.html");
prop.put("rssYacyImageURL", "http://" + hostName + "/env/grafics/yacy.gif");
prop.putASIS("searchBaseURL", "http://" + hostName + "/yacysearch.html");
prop.putASIS("rssYacyImageURL", "http://" + hostName + "/env/grafics/yacy.gif");
}

prop.put("searchagain", (global) ? 1 : 0);
Expand All @@ -408,6 +409,11 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("input_contentdomCheckImage", (contentdomCode == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 1 : 0);
prop.put("input_contentdomCheckApp", (contentdomCode == plasmaSearchQuery.CONTENTDOM_APP) ? 1 : 0);

// for RSS: don't HTML encode some elements
String q = htmlTools.encodeUnicode2xml(post.get("search", ""));
prop.put("rss_query", q);
prop.put("rss_queryenc", yacyURL.escape(q));

// return rewrite properties
return prop;
}
Expand Down
14 changes: 7 additions & 7 deletions htroot/yacysearch.rss
@@ -1,24 +1,24 @@
<?xml version="1.0"?>
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>
<rss version="2.0"
xmlns:yacyTopwords="http://www.yacy.net/yacy/topwords"
xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom">
<!-- YaCy P2P Web Search - Results; http://yacy.net -->
<channel>
<title>YaCy P2P-Search for #[former]#</title>
<description>Search for #[former]#</description>
<link>#[searchBaseURL]#?search=#[former]#&amp;resource=#[input_resource]#&amp;contentdom=#[input_contentdom]#</link>
<title>YaCy P2P-Search for #[rss_query]#</title>
<description>Search for #[rss_query]#</description>
<link>#[searchBaseURL]#?search=#[rss_queryenc]#&amp;resource=#[input_resource]#&amp;contentdom=#[input_contentdom]#</link>
<image>
<url>#[rssYacyImageURL]#</url>
<title>Search for #[former]#</title>
<link>#[searchBaseURL]#?search=#[former]#&amp;resource=#[input_resource]#&amp;contentdom=#[input_contentdom]#</link>
<title>Search for #[rss_query]#</title>
<link>#[searchBaseURL]#?search=#[rss_queryenc]#&amp;resource=#[input_resource]#&amp;contentdom=#[input_contentdom]#</link>
</image>
<opensearch:totalResults>#[num-results_totalcount]#</opensearch:totalResults>
<opensearch:startIndex>#[num-results_offset]#</opensearch:startIndex>
<opensearch:itemsPerPage>#[num-results_itemsPerPage]#</opensearch:itemsPerPage>
<atom:link rel="related" href="opensearchdescription.xml" type="application/opensearchdescription+xml"/>
<opensearch:Query role="request" searchTerms="#[former]#" />
<opensearch:Query role="request" searchTerms="#[rss_queryenc]#" />

#{results}#
<!--#include virtual="yacysearchitem.html?rss=true&item=#[item]#&eventID=#[eventID]#" -->
Expand Down
4 changes: 2 additions & 2 deletions htroot/yacysearchitem.html
Expand Up @@ -41,9 +41,9 @@ <h4 class="linktitle">
#(/content)#
#(rss)#::
<item>
<title><![CDATA[#[title]#]]></title>
<title>#[title]#</title>
<link>#[link]#</link>
<description><![CDATA[#[description]#]]></description>
<description>#[description]#</description>
<pubDate>#[date]#</pubDate>
<guid isPermaLink="false">#[urlhash]#</guid>
</item>
Expand Down
5 changes: 3 additions & 2 deletions htroot/yacysearchitem.java
Expand Up @@ -32,6 +32,7 @@
import java.util.Set;
import java.util.TreeSet;

import de.anomic.data.htmlTools;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
Expand Down Expand Up @@ -174,8 +175,8 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
if (rss) {
// text search for rss output
prop.put("rss", 1); // switch on specific content
prop.put("rss_title", result.title());
prop.put("rss_description", result.textSnippet().getLineRaw());
prop.putASIS("rss_title", htmlTools.encodeUnicode2xml(result.title()));
prop.putASIS("rss_description", htmlTools.encodeUnicode2xml(result.textSnippet().getLineRaw()));
prop.put("rss_link", result.urlstring());
prop.put("rss_urlhash", result.hash());
prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified()));
Expand Down
54 changes: 43 additions & 11 deletions source/de/anomic/data/htmlTools.java
Expand Up @@ -2,30 +2,62 @@

public class htmlTools {

/**
 * Replaces special characters in a string with their entities according to HTML standards.
 * The replacement pairs are taken from the class-level <code>mapping</code> table and the
 * actual substitution is delegated to {@link #encode(String, String[], int, int)}.
 *
 * @param text a string that possibly contains special characters; may be <code>null</code>
 * @param includingAmpersand if <code>false</code> ampersands are not encoded (the first
 *        pair of the mapping table is skipped)
 * @return the string with all mapped characters replaced by their entities, or
 *         <code>null</code> if <code>text</code> is <code>null</code>
 */
//[FB], changes by [MN], re-implemented by [MC]
public static String encodeUnicode2html(String text, boolean includingAmpersand) {
    if (text == null) {
        return null;
    }
    // Entries are stored as (character, entity) pairs, so skipping the first
    // pair means starting at index 2.
    final int firstEntry = includingAmpersand ? 0 : 2;
    final int endOfTable = mapping.length;
    return encode(text, mapping, firstEntry, endOfTable);
}

/**
 * Replaces the special characters ampersand, quotation mark, and less than/greater than
 * by the escaping entities allowed in XML documents.
 * Only the first four pairs (indices 0 to 7) of the class-level <code>mapping</code>
 * table are applied; the substitution itself is done by
 * {@link #encode(String, String[], int, int)}.
 *
 * @param text the original String; may be <code>null</code>
 * @return the encoded String, or <code>null</code> if <code>text</code> is <code>null</code>
 */
public static String encodeUnicode2xml(String text) {
    // NOTE(review): assumes the first 8 slots of mapping hold the XML-relevant pairs - confirm.
    return (text == null) ? null : encode(text, mapping, 0, 8);
}

/**
* Generic method that replaces occurences of special character entities defined in map
* array with their corresponding mapping.
* @param text The String too process.
* @param map An array defining the entity mapping.
* @param spos It is possible to use a subset of the map only. This parameter defines the
* starting point in the map array.
* @param epos The ending point, see above.
* @return A copy of the original String with all entities defined in map replaced.
*/
public static String encode(String text, final String[] map, int spos, int epos) {
StringBuffer sb = new StringBuffer(text.length());
search: while (pos < text.length()) {
search: while (spos < text.length()) {
// find a (forward) mapping
loop: for (int i = (includingAmpersand) ? 0 : 2; i < mapping.length; i += 2) {
if (text.charAt(pos) != mapping[i].charAt(0)) continue loop;
loop: for (int i = spos; i < epos; i += 2) {
if (text.charAt(spos) != map[i].charAt(0)) continue loop;
// found match
sb.append(mapping[i + 1]);
pos++;
sb.append(map[i + 1]);
spos++;
continue search;
}
// not found match
sb.append(text.charAt(pos));
pos++;
sb.append(text.charAt(spos));
spos++;
}
return new String(sb);

return sb.toString();
}

public static String decodeHtml2Unicode(String text) {
Expand Down

0 comments on commit c5a8585

Please sign in to comment.