Skip to content

Commit

Permalink
some refactoring of topic generation
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6018 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jun 3, 2009
1 parent f28f62f commit 15fad76
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 51 deletions.
2 changes: 1 addition & 1 deletion htroot/yacy/search.java
Expand Up @@ -321,7 +321,7 @@ public static serverObjects respond(final httpRequestHeader header, final server

// prepare reference hints
final long timer = System.currentTimeMillis();
final ArrayList<NavigatorEntry> ws = theSearch.topics(10);
final ArrayList<NavigatorEntry> ws = theSearch.getTopicNavigator(10);
final StringBuilder refstr = new StringBuilder();
for (NavigatorEntry e: ws) {
refstr.append(",").append(e.name);
Expand Down
39 changes: 7 additions & 32 deletions htroot/yacysearchtrailer.java
Expand Up @@ -26,15 +26,11 @@

import java.util.ArrayList;
import java.util.Iterator;
import java.util.TreeSet;

import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.util.SetTools;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
Expand Down Expand Up @@ -83,38 +79,17 @@ public static serverObjects respond(final httpRequestHeader header, final server
}

// attach the bottom line with search references (topwords)
final ArrayList<NavigatorEntry> references = theSearch.topics(20);
final ArrayList<NavigatorEntry> references = theSearch.getTopicNavigator(10);
if (references.size() > 0) {
// get the topwords
final TreeSet<String> topwords = new TreeSet<String>(NaturalOrder.naturalComparator);
for (NavigatorEntry e: references) {
if (e.name.matches("[a-z]+")) {
topwords.add(e.name);
}
}

// filter out the badwords
final TreeSet<String> filteredtopwords = SetTools.joinConstructive(topwords, plasmaSwitchboard.badwords);
if (filteredtopwords.size() > 0) {
SetTools.excludeDestructive(topwords, plasmaSwitchboard.badwords);
}

// avoid stopwords being topwords
if (env.getConfig("filterOutStopwordsFromTopwords", "true").equals("true")) {
if ((plasmaSwitchboard.stopwords != null) && (plasmaSwitchboard.stopwords.size() > 0)) {
SetTools.excludeDestructive(topwords, plasmaSwitchboard.stopwords);
}
}

String word;
int hintcount = 0;
final Iterator<String> iter = topwords.iterator();
NavigatorEntry e;
Iterator<NavigatorEntry> iter = references.iterator();
while (iter.hasNext()) {
word = iter.next();
e = iter.next();
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
if (word != null) {
prop.putHTML("words_" + hintcount + "_word", word);
prop.putHTML("words_" + hintcount + "_newsearch", theQuery.queryString.replace(' ', '+') + "+" + word);
if (e.name != null) {
prop.putHTML("words_" + hintcount + "_word", e.name);
prop.putHTML("words_" + hintcount + "_newsearch", theQuery.queryString.replace(' ', '+') + "+" + e.name);
prop.put("words_" + hintcount + "_count", theQuery.displayResults());
prop.put("words_" + hintcount + "_offset", "0");
prop.put("words_" + hintcount + "_display", display);
Expand Down
21 changes: 7 additions & 14 deletions source/de/anomic/plasma/plasmaSearchEvent.java
Expand Up @@ -98,7 +98,6 @@ public final class plasmaSearchEvent {
long urlRetrievalAllTime;
long snippetComputationAllTime;
public ResultURLs crawlResults;
private ArrayList<NavigatorEntry> hostNavigator;

@SuppressWarnings("unchecked")
private plasmaSearchEvent(final plasmaSearchQuery query,
Expand All @@ -124,7 +123,6 @@ private plasmaSearchEvent(final plasmaSearchQuery query,
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.localSearchThread = null;
this.hostNavigator = null;
this.result = new SortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.images = new SortStore<plasmaSnippetCache.MediaSnippet>(-1);
this.failedURLs = new HashMap<String, String>(); // maps each url that a worker thread tried to work on, but failed, to a reason string
Expand Down Expand Up @@ -578,13 +576,12 @@ private void registerFailure(final String urlhash, final String reason) {
}

// Returns host navigator entries obtained from rankedCache.
// NOTE(review): this text comes from a diff view without +/- markers, so the lines below
// appear to interleave the pre- and post-commit bodies (cached hostNavigator field vs.
// direct delegation) - confirm against the repository before reading it as one method.
public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
// return the previously stored list if one exists
if (this.hostNavigator != null) return this.hostNavigator;
// if the local search thread is still alive, pause 100 ms before reading from rankedCache
if (localSearchThread != null && localSearchThread.isAlive()) {
// an InterruptedException raised during the pause is silently ignored
try {Thread.sleep(100L);} catch (final InterruptedException e) {}
}
// asks for a fixed 10 entries; the maxentries argument is not used on this line
this.hostNavigator = rankedCache.getHostNavigator(10);
// an empty list is replaced by null, so nothing is cached and null is returned
if (this.hostNavigator.size() == 0) this.hostNavigator = null;
return this.hostNavigator;
// delegates directly, passing maxentries through unchanged
return this.rankedCache.getHostNavigator(maxentries);
}

/**
 * Returns the topic navigator entries for this search event.
 * The work is delegated entirely to {@code rankedCache.getTopicNavigator}.
 *
 * @param maxentries passed through unchanged to {@code rankedCache.getTopicNavigator}
 * @return the list produced by {@code rankedCache.getTopicNavigator(maxentries)}
 */
public ArrayList<NavigatorEntry> getTopicNavigator(final int maxentries) {
    final ArrayList<NavigatorEntry> topicEntries = this.rankedCache.getTopicNavigator(maxentries);
    return topicEntries;
}

public ResultEntry oneResult(final int item) {
Expand Down Expand Up @@ -730,6 +727,7 @@ void prepareSecondarySearch() {
if (peer.equals(mypeerhash)) continue; // we don't need to ask ourselves
urls = entry1.getValue();
words = wordsFromPeer(peer, urls);
assert words.length() >= 12 : "words = " + words;
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
Expand Down Expand Up @@ -776,11 +774,6 @@ public void remove(final String urlhash) {
//assert e != null;
}

/**
 * Returns the topic navigator entries for this search event.
 * The work is delegated entirely to {@code rankedCache.getTopicNavigator}.
 *
 * @param count passed through unchanged to {@code rankedCache.getTopicNavigator}
 * @return the list produced by {@code rankedCache.getTopicNavigator(count)}
 */
public ArrayList<NavigatorEntry> topics(final int count) {
    final ArrayList<NavigatorEntry> topicEntries = this.rankedCache.getTopicNavigator(count);
    return topicEntries;
}

public static class ResultEntry {
// payload objects
private final URLMetadataRow urlentry;
Expand Down
9 changes: 6 additions & 3 deletions source/de/anomic/plasma/plasmaSearchRankingProcess.java
Expand Up @@ -459,9 +459,12 @@ public void addTopic(final String[] words) {
for (int i = 0; i < words.length; i++) {
word = words[i].toLowerCase();
Integer c;
if ((word.length() > 2) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(query.queryHashes.contains(Word.word2hash(word))))) {
if (word.length() > 2 &&
"http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0 &&
!query.queryHashes.contains(Word.word2hash(word)) &&
word.matches("[a-z]+") &&
!plasmaSwitchboard.badwords.contains(word) &&
!plasmaSwitchboard.stopwords.contains(word)) {
c = ref.get(word);
if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/yacy/yacySearch.java
Expand Up @@ -289,7 +289,7 @@ public static yacySearch secondaryRemoteSearch(
final String targethash, final Blacklist blacklist,
final plasmaSearchRankingProfile rankingProfile,
final Bitfield constraint, final TreeMap<byte[], String> clusterselection) {
assert wordhashes.length() >= 12;
assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes;

// check own peer status
if (peers.mySeed() == null || peers.mySeed().getPublicAddress() == null) { return null; }
Expand Down

0 comments on commit 15fad76

Please sign in to comment.