Skip to content

Commit

Permalink
update to snippet marker:
Browse files Browse the repository at this point in the history
- do not display indexed html (solves xss issues)
the single words are analyzed for already marked parts. this is needed to avoid false encoding of the marker (<b>) tags.
- improved speed for existing routine
heavy used regex pattern are precompiled now

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5322 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
lotus committed Nov 8, 2008
1 parent 16723d0 commit fad044f
Showing 1 changed file with 118 additions and 54 deletions.
172 changes: 118 additions & 54 deletions source/de/anomic/plasma/plasmaSnippetCache.java
Expand Up @@ -35,7 +35,10 @@
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.anomic.htmlFilter.htmlFilterCharacterCoding;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.HttpClient;
import de.anomic.http.httpResponseHeader;
Expand Down Expand Up @@ -102,6 +105,27 @@ public static class TextSnippet {
Set<String> remaingHashes;
private final yacyURL favicon;

/**
* <code>\\A[^\\p{L}\\p{N}].+</code>
*/
private final static Pattern p1 = Pattern.compile("\\A[^\\p{L}\\p{N}].+");
/**
* <code>.+[^\\p{L}\\p{N}]\\Z</code>
*/
private final static Pattern p2 = Pattern.compile(".+[^\\p{L}\\p{N}]\\Z");
/**
* <code>\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z</code>
*/
private final static Pattern p3 = Pattern.compile("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z");
/**
* <code>[^\\p{L}\\p{N}]</code>
*/
private final static Pattern p4 = Pattern.compile("[^\\p{L}\\p{N}]");
/**
* <code>(.*?)(\\&lt;b\\&gt;.+?\\&lt;/b\\&gt;)(.*)</code>
*/
private final static Pattern p01 = Pattern.compile("(.*?)(\\<b\\>.+?\\</b\\>)(.*)"); // marked words are in <b>-tags

public TextSnippet(final yacyURL url, final String line, final int errorCode, final Set<String> remaingHashes, final String errortext) {
this(url,line,errorCode,remaingHashes,errortext,null);
}
Expand All @@ -117,6 +141,9 @@ public TextSnippet(final yacyURL url, final String line, final int errorCode, fi
public yacyURL getUrl() {
return this.url;
}
public yacyURL getFavicon() {
return this.favicon;
}
public boolean exists() {
return line != null;
}
Expand All @@ -142,60 +169,18 @@ public String getLineMarked(final Set<String> queryHashes) {
final Iterator<String> i = queryHashes.iterator();
String h;
final String[] w = line.split(" ");
String prefix = "";
String postfix = "";
int len = 0;
while (i.hasNext()) {
h = i.next();
for (int j = 0; j < w.length; j++) {
//ignore punctuation marks (contrib [MN])
//note to myself:
//For details on regex see "Mastering regular expressions" by J.E.F. Friedl
//especially p. 123 and p. 390/391 (in the German version of the 2nd edition)

prefix = "";
postfix = "";

// cut off prefix if it contains of non-characters or non-numbers
while(w[j].matches("\\A[^\\p{L}\\p{N}].+")) {
prefix = prefix + w[j].substring(0,1);
w[j] = w[j].substring(1);
}

// cut off postfix if it contains of non-characters or non-numbers
while(w[j].matches(".+[^\\p{L}\\p{N}]\\Z")) {
len = w[j].length();
postfix = w[j].substring(len-1,len) + postfix;
w[j] = w[j].substring(0,len-1);
}

//special treatment if there is a special character in the word
if(w[j].matches("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z")) {
String out = "";
String temp = "";
for(int k=0; k < w[j].length(); k++) {
//is character a special character?
if(w[j].substring(k,k+1).matches("[^\\p{L}\\p{N}]")) {
if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + temp + "</b>";
out = out + temp + w[j].substring(k,k+1);
temp = "";
}
//last character
else if(k == (w[j].length()-1)) {
temp = temp + w[j].substring(k,k+1);
if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + temp + "</b>";
out = out + temp;
temp = "";
}
else temp = temp + w[j].substring(k,k+1);
}
w[j] = out;
}

//end contrib [MN]
else if (indexWord.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";

w[j] = prefix + w[j] + postfix;
final ArrayList<String> al = markedWordArrayList(w[j]); // mark special character separated words correctly if more than 1 word has to be marked
w[j] = "";
for (int k = 0; k < al.size(); k++) {
if(k % 2 == 0){ // word has not been marked
w[j] += getWordMarked(al.get(k), h);
} else { // word has been marked, do not encode again
w[j] += al.get(k);
}
}
}
}
final StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8);
Expand All @@ -205,10 +190,89 @@ else if(k == (w[j].length()-1)) {
}
return l.toString().trim();
}

public yacyURL getFavicon() {
return this.favicon;

/**
* mark words with &lt;b&gt;-tags
* @param word the word to mark
* @param h the hash of the word to mark
* @return the marked word if hash matches, else the unmarked word
* @see #getLineMarked(Set)
*/
private static String getWordMarked(String word, String h){
//ignore punctuation marks (contrib [MN])
//note to myself:
//For details on regex see "Mastering regular expressions" by J.E.F. Friedl
//especially p. 123 and p. 390/391 (in the German version of the 2nd edition)

String prefix = "";
String postfix = "";
int len = 0;

// cut off prefix if it contains of non-characters or non-numbers
while(p1.matcher(word).find()) {
prefix = prefix + word.substring(0,1);
word = word.substring(1);
}

// cut off postfix if it contains of non-characters or non-numbers
while(p2.matcher(word).find()) {
len = word.length();
postfix = word.substring(len-1,len) + postfix;
word = word.substring(0,len-1);
}

//special treatment if there is a special character in the word
if(p3.matcher(word).find()) {
String out = "";
String temp = "";
for(int k=0; k < word.length(); k++) {
//is character a special character?
if(p4.matcher(word.substring(k,k+1)).find()) {
if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + htmlFilterCharacterCoding.unicode2html(temp, false) + "</b>";
out = out + temp + htmlFilterCharacterCoding.unicode2html(word.substring(k,k+1), false);
temp = "";
}
//last character
else if(k == (word.length()-1)) {
temp = temp + word.substring(k,k+1);
if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + htmlFilterCharacterCoding.unicode2html(temp, false) + "</b>";
out = out + temp;
temp = "";
}
else temp = temp + word.substring(k,k+1);
}
word = out;
}

//end contrib [MN]
else if (indexWord.word2hash(word).equals(h)) word = "<b>" + htmlFilterCharacterCoding.unicode2html(word, false) + "</b>";

word = htmlFilterCharacterCoding.unicode2html(prefix, false)
+ word
+ htmlFilterCharacterCoding.unicode2html(postfix, false);
return word;
}

/**
* words that already has been marked has index <code>(i % 2 == 1)</code>
* words that has not yet been marked has index <code>(i % 2 == 0)</code>
* @param string the String to be processed
* @return words that already has and has not yet been marked
* @author [DW], 08.11.2008
*/
private static ArrayList<String> markedWordArrayList(String string){
ArrayList<String> al = new java.util.ArrayList<String>(1);
Matcher m = p01.matcher(string);
while (m.find()) {
al.add(m.group(1));
al.add(m.group(2));
string = m.group(3); // the postfix
m = p01.matcher(string);
}
al.add(string);
return al;
}

}

public static class MediaSnippet {
Expand Down

0 comments on commit fad044f

Please sign in to comment.