fixed a problem with attribute flags on RWI entries that prevented proper selection of index-of constraint

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5437 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Jan 4, 2009
1 parent 6072831 commit c4c4c22
Showing 5 changed files with 55 additions and 24 deletions.
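Reading the diff, the apparent root cause is ordering: plasmaCondenser constructed the document category flags (hasimage, hasaudio, hasvideo, hasapp) only after words had already been inserted, and the URL-component words were inserted before createCondensement had a chance to set the index-of flag, so the flag template copied onto each RWI entry was still empty and the index-of constraint could never select those entries. The commit moves the flag construction to the top of the constructor, defers the URL-word insertion until after text condensement, and relaxes the "index of" detection. A minimal sketch of the ordering pitfall, using java.util.BitSet and invented names (insertWord, FLAG_HASIMAGE) rather than the real YaCy API:

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

public class FlagOrderSketch {
    static final int FLAG_HASIMAGE = 0; // hypothetical bit position, not the YaCy constant
    static final List<BitSet> rwiEntries = new ArrayList<>();

    // each indexed word snapshots the document-level flag template at insertion time
    static void insertWord(String word, BitSet template) {
        rwiEntries.add((BitSet) template.clone());
    }

    public static void main(String[] args) {
        BitSet flags = new BitSet(4);
        insertWord("example", flags);           // old order: word inserted first ...
        flags.set(FLAG_HASIMAGE);               // ... flags set too late
        System.out.println(rwiEntries.get(0));  // {} -> constraint cannot match

        rwiEntries.clear();
        flags = new BitSet(4);
        flags.set(FLAG_HASIMAGE);               // new order: flags first ...
        insertWord("example", flags);           // ... word entry inherits them
        System.out.println(rwiEntries.get(0));  // {0}
    }
}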
7 changes: 7 additions & 0 deletions htroot/ViewFile.html
@@ -31,6 +31,7 @@ <h2>View URL Content</h2>
<option value="plain"#(vMode-plain)#:: selected="selected"#(/vMode-plain)#>Plain Text</option>
<option value="parsed"#(vMode-parsed)#:: selected="selected"#(/vMode-parsed)#>Parsed Text</option>
<option value="sentences"#(vMode-sentences)#:: selected="selected"#(/vMode-sentences)#>Parsed Sentences</option>
<option value="words"#(vMode-words)#:: selected="selected"#(/vMode-words)#>Parsed Tokens/Words</option>
<option value="links"#(vMode-links)#:: selected="selected"#(/vMode-links)#>Link List</option>
</select>
<input type="submit" name="show" value="Show" />
@@ -93,6 +94,12 @@ <h3>Link List</h3><br>
<td class="tt">#[attr]#</tt></td>
</tr>#{/links}#
</table>
:: <!-- 6 -->
<fieldset><legend>Parsed Tokens</legend>
<ol>#{words}#
<li class="tt">#[word]#</li>#{/words}#
</ol>
</fieldset>
#(/viewMode)#
</p>

31 changes: 30 additions & 1 deletion htroot/ViewFile.java
@@ -29,6 +29,7 @@
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -59,6 +60,7 @@ public class ViewFile {
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
public static final int VIEW_MODE_AS_IFRAME = 4;
public static final int VIEW_MODE_AS_LINKLIST = 5;
public static final int VIEW_MODE_AS_PARSED_WORDS = 6;

private static final String HIGHLIGHT_CSS = "searchHighlight";
private static final int MAX_HIGHLIGHTS = 6;
@@ -240,7 +242,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url", url.toNormalform(false, true));

} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("links")) {
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
@@ -297,6 +299,33 @@ public static serverObjects respond(final httpRequestHeader header, final server
}
prop.put("viewMode_sentences", i);

} else if (viewMode.equals("words")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
final Iterator<StringBuilder> sentences = document.getSentences(pre);

boolean dark = true;
int i = 0;
String sentence, token;
if (sentences != null) {

// Search word highlighting
while (sentences.hasNext()) {
sentence = sentences.next().toString();
Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString();
if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token);
prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
dark = !dark;
i++;
}
}
}
}
prop.put("viewMode_words", i);

} else if (viewMode.equals("links")) {
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
boolean dark = true;
25 changes: 12 additions & 13 deletions source/de/anomic/plasma/plasmaCondenser.java
@@ -108,13 +108,15 @@ public plasmaCondenser(final plasmaParserDocument document, final boolean indexT
this.wordcut = 2;
this.words = new TreeMap<String, indexWord>();
this.RESULT_FLAGS = new kelondroBitfield(4);

// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);

this.languageIdentificator = new Identificator();

//System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));

// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);

Map.Entry<yacyURL, String> entry;
if (indexText) {
@@ -161,6 +163,9 @@ public plasmaCondenser(final plasmaParserDocument document, final boolean indexT
this.RESULT_DIFF_SENTENCES = 0;
}

// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);

if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
@@ -209,12 +214,6 @@ public plasmaCondenser(final plasmaParserDocument document, final boolean indexT
}
}
}

// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
}

private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
@@ -360,7 +359,7 @@ private void createCondensement(final InputStream is, final String charset) thro
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
if ((last_index) && (word.equals("of"))) comb_indexof = true;
if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");

@@ -491,10 +490,10 @@ public final static boolean invisible(final char c) {
else
return true;
}

public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
try {
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes("UTF-8")));
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
} catch (final Exception e) {
return null;
}
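For reference, a minimal call of the adjusted tokenizer, mirroring the new ViewFile.java usage above; the exact tokens emitted depend on sievedWordsEnum's filtering, so the output shown in comments is an assumption:

import java.util.Enumeration;
import de.anomic.plasma.plasmaCondenser;

public class TokenizerDemo {
    public static void main(String[] args) {
        final Enumeration<StringBuilder> tokens =
                plasmaCondenser.wordTokenizer("Index of /public/files", "UTF-8");
        // wordTokenizer returns null if the charset is unsupported
        while (tokens != null && tokens.hasMoreElements()) {
            System.out.println(tokens.nextElement()); // e.g. "index", "of", "public", "files"
        }
    }
}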
14 changes: 5 additions & 9 deletions source/de/anomic/plasma/plasmaSnippetCache.java
@@ -330,13 +330,9 @@ public static TextSnippet retrieveTextSnippet(final indexURLReference.Components
// trying to load the resource from the cache
resContent = plasmaHTCache.getResourceContentStream(url);
responseHeader = plasmaHTCache.loadResponseHeader(url);
if (resContent != null) {
// if the content was found
resContentLength = plasmaHTCache.getResourceContentLength(url);
if ((resContentLength > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
}
if (resContent != null && ((resContentLength = plasmaHTCache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
} else if (containsAllHashes(comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
return new TextSnippet(url, (comp.dc_subject().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
@@ -346,7 +342,7 @@ public static TextSnippet retrieveTextSnippet(final indexURLReference.Components
} else if (containsAllHashes(comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (containsAllHashes(comp.url().toNormalform(true, true), queryhashes)) {
} else if (containsAllHashes(comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (fetchOnline) {
@@ -673,7 +669,7 @@ private static String computeMediaSnippet(Map<yacyURL, String> media, Set<String
final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2;
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = sentence.length() - around - 5;
2 changes: 1 addition & 1 deletion source/de/anomic/yacy/yacyPeerSelection.java
@@ -100,7 +100,7 @@ public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final String starthash, int
this.remaining = max;
this.doublecheck = new HashSet<String>();
this.nextSeed = nextInternal();
this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
this.alsoMyOwn = alsoMyOwn && nextSeed != null && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
}

public boolean hasNext() {
