diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 041adf2133..63e40b6acd 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -32,7 +32,6 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import java.util.Set; import java.util.TreeSet; import de.anomic.content.RSSMessage; @@ -49,6 +48,7 @@ import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSearchEvent.ResultEntry; +import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; @@ -321,11 +321,10 @@ public static serverObjects respond(final httpRequestHeader header, final server // prepare reference hints final long timer = System.currentTimeMillis(); - final Set ws = theSearch.references(10); + final ArrayList ws = theSearch.topics(10); final StringBuilder refstr = new StringBuilder(); - final Iterator j = ws.iterator(); - while (j.hasNext()) { - refstr.append(",").append(j.next()); + for (NavigatorEntry e: ws) { + refstr.append(",").append(e.name); } prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString()); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), "reference collection", ws.size(), System.currentTimeMillis() - timer), false); diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 8b6ed68778..1ce1c6dd89 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -26,7 +26,6 @@ import java.util.ArrayList; import java.util.Iterator; -import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpRequestHeader; @@ -36,7 +35,7 @@ import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry; +import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; @@ -65,17 +64,17 @@ public static serverObjects respond(final httpRequestHeader header, final server // compose search navigation - ArrayList hostNavigator = theSearch.getHostNavigator(10); + ArrayList hostNavigator = theSearch.getHostNavigator(10); if (hostNavigator == null) { prop.put("navigation", 0); } else { prop.put("navigation", 1); - hostnaventry entry; + NavigatorEntry entry; int i; for (i = 0; i < hostNavigator.size(); i++) { entry = hostNavigator.get(i); - prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")"); - prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")"); + prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")"); + prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")"); prop.put("navigation_domains_" + i + "_nl", 1); } i--; @@ -84,16 +83,13 @@ public static serverObjects respond(final httpRequestHeader header, final server } // attach the bottom line with search references (topwords) - final Set references = theSearch.references(20); + final ArrayList references = theSearch.topics(20); if (references.size() > 0) { // get the topwords final TreeSet topwords = new TreeSet(NaturalOrder.naturalComparator); - String tmp = ""; - final Iterator i = references.iterator(); - while (i.hasNext()) { - tmp = i.next(); - if (tmp.matches("[a-z]+")) { - topwords.add(tmp); + for (NavigatorEntry e: references) { + if (e.name.matches("[a-z]+")) { + topwords.add(e.name); } } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index ce68b08404..67946cfe86 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -32,7 +32,6 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -53,7 +52,7 @@ import de.anomic.kelondro.util.Log; import de.anomic.plasma.parser.Word; import de.anomic.plasma.parser.Condenser; -import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry; +import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry; import de.anomic.plasma.plasmaSnippetCache.MediaSnippet; import de.anomic.server.serverProfiling; import de.anomic.yacy.yacySearch; @@ -99,7 +98,7 @@ public final class plasmaSearchEvent { long urlRetrievalAllTime; long snippetComputationAllTime; public ResultURLs crawlResults; - private ArrayList hostNavigator; + private ArrayList hostNavigator; @SuppressWarnings("unchecked") private plasmaSearchEvent(final plasmaSearchQuery query, @@ -559,7 +558,7 @@ public void run() { // place the result to the result vector if (!result.exists(resultEntry)) { result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()))); - rankedCache.addReferences(resultEntry); + rankedCache.addTopics(resultEntry); } //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); } @@ -579,7 +578,7 @@ private void registerFailure(final String urlhash, final String reason) { Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason); } - public ArrayList getHostNavigator(int maxentries) { + public ArrayList getHostNavigator(int maxentries) { if (this.hostNavigator != null) return this.hostNavigator; if (localSearchThread != null && localSearchThread.isAlive()) { try {Thread.sleep(100L);} catch (final InterruptedException e) {} @@ -778,9 +777,9 @@ public void remove(final String urlhash) { //assert e != null; } - public Set references(final int count) { + public ArrayList topics(final int count) { // returns a set of words that are computed as toplist - return this.rankedCache.getReferences(count); + return this.rankedCache.getTopicNavigator(count); } public static class ResultEntry { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 0b20aeb729..050cc97936 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -48,7 +48,6 @@ import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; -import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.SortStack; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.parser.Word; @@ -71,13 +70,13 @@ public final class plasmaSearchRankingProcess { private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; private final ReferenceOrder order; private final ConcurrentHashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) - private final ScoreCluster ref; // reference score computation for the commonSense heuristic private final int[] flagcount; // flag counter private final TreeSet misses; // contains url-hashes that could not been found in the LURL-DB private final Segment indexSegment; private HashMap>[] localSearchContainerMaps; private final int[] domZones; - private ConcurrentHashMap hostNavigator; + private final ConcurrentHashMap hostNavigator; + private final ConcurrentHashMap ref; // reference score computation for the commonSense heuristic public plasmaSearchRankingProcess( final Segment indexSegment, @@ -99,13 +98,13 @@ public plasmaSearchRankingProcess( this.remote_resourceSize = 0; this.local_resourceSize = 0; this.urlhashes = new ConcurrentHashMap(0, 0.75f, concurrency); - this.ref = new ScoreCluster(); this.misses = new TreeSet(); this.indexSegment = indexSegment; this.flagcount = new int[32]; for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} - this.domZones = new int[8]; this.hostNavigator = new ConcurrentHashMap(); + this.ref = new ConcurrentHashMap(); + this.domZones = new int[8]; for (int i = 0; i < 8; i++) {this.domZones[i] = 0;} } @@ -232,52 +231,6 @@ public void insertRanked(final ReferenceContainer index, final bo serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false); } - public class hoststat { - public int count; - public String hashsample; - public hoststat(String urlhash) { - this.count = 1; - this.hashsample = urlhash; - } - public void inc() { - this.count++; - } - } - - public static final Comparator hscomp = new Comparator() { - public int compare(hoststat o1, hoststat o2) { - if (o1.count < o2.count) return 1; - if (o2.count < o1.count) return -1; - return 0; - } - }; - - public class hostnaventry { - public int count; - public String host; - public hostnaventry(String host, int count) { - this.host = host; - this.count = count; - } - } - - public ArrayList getHostNavigator(int maxentries) { - hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]); - Arrays.sort(hsa, hscomp); - int rc = Math.min(maxentries, hsa.length); - ArrayList result = new ArrayList(); - URLMetadataRow mr; - yacyURL url; - for (int i = 0; i < rc; i++) { - mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0); - if (mr == null) continue; - url = mr.metadata().url(); - if (url == null) continue; - result.add(new hostnaventry(url.getHost(), hsa[i].count)); - } - return result; - } - private boolean testFlags(final WordReference ientry) { if (query.constraint == null) return true; // test if ientry matches with filter @@ -424,37 +377,103 @@ public Iterator miss() { return this.misses.iterator(); } - public Set getReferences(final int count) { + public class hoststat { + public int count; + public String hashsample; + public hoststat(String urlhash) { + this.count = 1; + this.hashsample = urlhash; + } + public void inc() { + this.count++; + } + } + + public static final Comparator hscomp = new Comparator() { + public int compare(hoststat o1, hoststat o2) { + if (o1.count < o2.count) return 1; + if (o2.count < o1.count) return -1; + return 0; + } + }; + + public class NavigatorEntry { + public int count; + public String name; + public NavigatorEntry(String name, int count) { + this.name = name; + this.count = count; + } + } + + public ArrayList getHostNavigator(int count) { + hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]); + Arrays.sort(hsa, hscomp); + int rc = Math.min(count, hsa.length); + ArrayList result = new ArrayList(); + URLMetadataRow mr; + yacyURL url; + for (int i = 0; i < rc; i++) { + mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0); + if (mr == null) continue; + url = mr.metadata().url(); + if (url == null) continue; + result.add(new NavigatorEntry(url.getHost(), hsa[i].count)); + } + return result; + } + + public static final Comparator> mecomp = new Comparator>() { + public int compare(Map.Entry o1, Map.Entry o2) { + if (o1.getValue().intValue() < o2.getValue().intValue()) return 1; + if (o2.getValue().intValue() < o1.getValue().intValue()) return -1; + return 0; + } + }; + + @SuppressWarnings("unchecked") + public ArrayList getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls - final Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE); - final TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER); - for (int i = 0; i < refs.length; i++) { - s.add((String) refs[i]); + + Map.Entry[] a = this.ref.entrySet().toArray(new Map.Entry[this.ref.size()]); + Arrays.sort(a, mecomp); + int rc = Math.min(count, a.length); + ArrayList result = new ArrayList(); + Map.Entry e; + int c; + for (int i = 0; i < rc; i++) { + e = a[i]; + c = e.getValue().intValue(); + if (c == 0) break; + result.add(new NavigatorEntry(e.getKey(), c)); } - return s; + return result; } - public void addReferences(final String[] words) { + public void addTopic(final String[] words) { String word; for (int i = 0; i < words.length; i++) { word = words[i].toLowerCase(); + Integer c; if ((word.length() > 2) && ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && - (!(query.queryHashes.contains(Word.word2hash(word))))) - ref.incScore(word); + (!(query.queryHashes.contains(Word.word2hash(word))))) { + c = ref.get(word); + if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1); + } } } - protected void addReferences(final plasmaSearchEvent.ResultEntry resultEntry) { + protected void addTopics(final plasmaSearchEvent.ResultEntry resultEntry) { // take out relevant information for reference computation if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; - final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url + //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description // add references - addReferences(urlcomps); - addReferences(descrcomps); + //addTopic(urlcomps); + addTopic(descrcomps); } public ReferenceOrder getOrder() { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 3eaa91a63d..4cbbcabb6e 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -604,8 +604,8 @@ public static String[] search( yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references); if (references != null) { // add references twice, so they can be countet (must have at least 2 entries) - containerCache.addReferences(references.split(",")); - containerCache.addReferences(references.split(",")); + containerCache.addTopic(references.split(",")); + containerCache.addTopic(references.split(",")); } }