Skip to content

Commit

Permalink
renamed topwords to topics and enhanced computation methods of topics
Browse files Browse the repository at this point in the history
topics will now only be computed using the document title, not the document url,
because the host navigator is now responsible for statistical effects of urls.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6011 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jun 2, 2009
1 parent 61d9e13 commit ab06a6e
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 91 deletions.
9 changes: 4 additions & 5 deletions htroot/yacy/search.java
Expand Up @@ -32,7 +32,6 @@
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.content.RSSMessage;
Expand All @@ -49,6 +48,7 @@
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSearchEvent.ResultEntry;
import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
Expand Down Expand Up @@ -321,11 +321,10 @@ public static serverObjects respond(final httpRequestHeader header, final server

// prepare reference hints
final long timer = System.currentTimeMillis();
final Set<String> ws = theSearch.references(10);
final ArrayList<NavigatorEntry> ws = theSearch.topics(10);
final StringBuilder refstr = new StringBuilder();
final Iterator<String> j = ws.iterator();
while (j.hasNext()) {
refstr.append(",").append(j.next());
for (NavigatorEntry e: ws) {
refstr.append(",").append(e.name);
}
prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString());
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), "reference collection", ws.size(), System.currentTimeMillis() - timer), false);
Expand Down
22 changes: 9 additions & 13 deletions htroot/yacysearchtrailer.java
Expand Up @@ -26,7 +26,6 @@

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.http.httpRequestHeader;
Expand All @@ -36,7 +35,7 @@
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
Expand Down Expand Up @@ -65,17 +64,17 @@ public static serverObjects respond(final httpRequestHeader header, final server


// compose search navigation
ArrayList<hostnaventry> hostNavigator = theSearch.getHostNavigator(10);
ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
if (hostNavigator == null) {
prop.put("navigation", 0);
} else {
prop.put("navigation", 1);
hostnaventry entry;
NavigatorEntry entry;
int i;
for (i = 0; i < hostNavigator.size(); i++) {
entry = hostNavigator.get(i);
prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")</a>");
prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")</a>");
prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")</a>");
prop.put("navigation_domains_" + i + "_nl", 1);
}
i--;
Expand All @@ -84,16 +83,13 @@ public static serverObjects respond(final httpRequestHeader header, final server
}

// attach the bottom line with search references (topwords)
final Set<String> references = theSearch.references(20);
final ArrayList<NavigatorEntry> references = theSearch.topics(20);
if (references.size() > 0) {
// get the topwords
final TreeSet<String> topwords = new TreeSet<String>(NaturalOrder.naturalComparator);
String tmp = "";
final Iterator<String> i = references.iterator();
while (i.hasNext()) {
tmp = i.next();
if (tmp.matches("[a-z]+")) {
topwords.add(tmp);
for (NavigatorEntry e: references) {
if (e.name.matches("[a-z]+")) {
topwords.add(e.name);
}
}

Expand Down
13 changes: 6 additions & 7 deletions source/de/anomic/plasma/plasmaSearchEvent.java
Expand Up @@ -32,7 +32,6 @@
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
Expand All @@ -53,7 +52,7 @@
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacySearch;
Expand Down Expand Up @@ -99,7 +98,7 @@ public final class plasmaSearchEvent {
long urlRetrievalAllTime;
long snippetComputationAllTime;
public ResultURLs crawlResults;
private ArrayList<hostnaventry> hostNavigator;
private ArrayList<NavigatorEntry> hostNavigator;

@SuppressWarnings("unchecked")
private plasmaSearchEvent(final plasmaSearchQuery query,
Expand Down Expand Up @@ -559,7 +558,7 @@ public void run() {
// place the result to the result vector
if (!result.exists(resultEntry)) {
result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())));
rankedCache.addReferences(resultEntry);
rankedCache.addTopics(resultEntry);
}
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
}
Expand All @@ -579,7 +578,7 @@ private void registerFailure(final String urlhash, final String reason) {
Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
}

public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
if (this.hostNavigator != null) return this.hostNavigator;
if (localSearchThread != null && localSearchThread.isAlive()) {
try {Thread.sleep(100L);} catch (final InterruptedException e) {}
Expand Down Expand Up @@ -778,9 +777,9 @@ public void remove(final String urlhash) {
//assert e != null;
}

// NOTE(review): merged diff view — the marker-stripped hunk shows both the
// removed references() variant (first signature / first return) and its
// topics() replacement (second signature / second return) on adjacent lines.
public Set<String> references(final int count) {
public ArrayList<NavigatorEntry> topics(final int count) {
// returns at most count topic entries, ranked by the topic-word statistics
// that the ranking cache collected while results were gathered
return this.rankedCache.getReferences(count);
return this.rankedCache.getTopicNavigator(count);
}

public static class ResultEntry {
Expand Down
147 changes: 83 additions & 64 deletions source/de/anomic/plasma/plasmaSearchRankingProcess.java
Expand Up @@ -48,7 +48,6 @@
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Word;
Expand All @@ -71,13 +70,13 @@ public final class plasmaSearchRankingProcess {
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
private final ReferenceOrder order;
private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final ScoreCluster<String> ref; // reference score computation for the commonSense heuristic
private final int[] flagcount; // flag counter
private final TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
private final Segment indexSegment;
private HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps;
private final int[] domZones;
private ConcurrentHashMap<String, hoststat> hostNavigator;
private final ConcurrentHashMap<String, hoststat> hostNavigator;
private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic

public plasmaSearchRankingProcess(
final Segment indexSegment,
Expand All @@ -99,13 +98,13 @@ public plasmaSearchRankingProcess(
this.remote_resourceSize = 0;
this.local_resourceSize = 0;
this.urlhashes = new ConcurrentHashMap<String, Integer>(0, 0.75f, concurrency);
this.ref = new ScoreCluster<String>();
this.misses = new TreeSet<String>();
this.indexSegment = indexSegment;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
this.domZones = new int[8];
this.hostNavigator = new ConcurrentHashMap<String, hoststat>();
this.ref = new ConcurrentHashMap<String, Integer>();
this.domZones = new int[8];
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
}

Expand Down Expand Up @@ -232,52 +231,6 @@ public void insertRanked(final ReferenceContainer<WordReference> index, final bo
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
}

/**
 * Per-host hit counter used while collecting the host navigator: remembers
 * how many results were seen for one host, plus a sample url-hash that can
 * later be resolved back to the host name via the url metadata index.
 */
public class hoststat {
    /** url-hash of one representative document for this host */
    public String hashsample;
    /** number of results counted for this host so far; starts at 1 */
    public int count;

    public hoststat(String urlhash) {
        this.hashsample = urlhash;
        this.count = 1;
    }

    /** counts one more occurrence of this host */
    public void inc() {
        this.count++;
    }
}

/**
 * Orders {@link hoststat} objects by descending count, so that the most
 * frequently seen hosts sort to the front; equal counts compare as equal.
 */
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
    public int compare(hoststat o1, hoststat o2) {
        // descending: a larger count sorts earlier
        if (o1.count > o2.count) return -1;
        if (o1.count < o2.count) return 1;
        return 0;
    }
};

/**
 * One entry of the host navigator: a host name together with the number of
 * search results that were located on that host.
 */
public class hostnaventry {
    /** host name shown in the navigation sidebar */
    public String host;
    /** number of results counted for this host */
    public int count;

    public hostnaventry(String host, int count) {
        this.host = host;
        this.count = count;
    }
}

/**
 * Builds the host navigator list: ranks all hosts counted so far by
 * descending result count and resolves at most <code>maxentries</code> of
 * them to host names. Hosts whose sample url-hash can no longer be loaded
 * from the url metadata index are silently skipped.
 *
 * @param maxentries maximum number of navigator entries to return
 * @return ranked host entries, possibly fewer than maxentries
 */
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
    // snapshot the concurrent map and sort by descending count
    final hoststat[] ranked = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
    Arrays.sort(ranked, hscomp);
    final ArrayList<hostnaventry> entries = new ArrayList<hostnaventry>();
    final int limit = Math.min(maxentries, ranked.length);
    for (int i = 0; i < limit; i++) {
        // resolve the sample hash back to a url to obtain the host name
        final URLMetadataRow row = indexSegment.urlMetadata().load(ranked[i].hashsample, null, 0);
        if (row == null) continue;
        final yacyURL location = row.metadata().url();
        if (location == null) continue;
        entries.add(new hostnaventry(location.getHost(), ranked[i].count));
    }
    return entries;
}

private boolean testFlags(final WordReference ientry) {
if (query.constraint == null) return true;
// test if ientry matches with filter
Expand Down Expand Up @@ -424,37 +377,103 @@ public Iterator<String> miss() {
return this.misses.iterator();
}

public Set<String> getReferences(final int count) {
/**
 * Per-host hit counter backing the host navigator: tracks how many results
 * belong to one host and keeps a sample url-hash so the host name can be
 * recovered later from the url metadata index.
 */
public class hoststat {
    /** url-hash of one representative document for this host */
    public String hashsample;
    /** number of results counted for this host so far; starts at 1 */
    public int count;

    public hoststat(String urlhash) {
        this.hashsample = urlhash;
        this.count = 1;
    }

    /** counts one more occurrence of this host */
    public void inc() {
        this.count++;
    }
}

/**
 * Comparator that ranks {@link hoststat} objects by descending count so the
 * busiest hosts appear first; ties compare as equal.
 */
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
    public int compare(hoststat o1, hoststat o2) {
        // descending: a larger count sorts earlier
        if (o1.count > o2.count) return -1;
        if (o1.count < o2.count) return 1;
        return 0;
    }
};

/**
 * One row of a search navigator facet: a display name (host name or topic
 * word) together with the number of results that carried it.
 */
public class NavigatorEntry {
    /** display string of the facet value */
    public String name;
    /** number of occurrences counted for this value */
    public int count;

    public NavigatorEntry(String name, int count) {
        this.name = name;
        this.count = count;
    }
}

/**
 * Builds the host navigator list: ranks all hosts counted so far by
 * descending result count and resolves at most <code>count</code> of them
 * to host names. Hosts whose sample url-hash can no longer be loaded from
 * the url metadata index are silently skipped.
 *
 * @param count maximum number of navigator entries to return
 * @return ranked host entries, possibly fewer than count
 */
public ArrayList<NavigatorEntry> getHostNavigator(int count) {
    // snapshot the concurrent map and sort by descending count
    final hoststat[] ranked = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
    Arrays.sort(ranked, hscomp);
    final ArrayList<NavigatorEntry> entries = new ArrayList<NavigatorEntry>();
    final int limit = Math.min(count, ranked.length);
    for (int i = 0; i < limit; i++) {
        // resolve the sample hash back to a url to obtain the host name
        final URLMetadataRow row = indexSegment.urlMetadata().load(ranked[i].hashsample, null, 0);
        if (row == null) continue;
        final yacyURL location = row.metadata().url();
        if (location == null) continue;
        entries.add(new NavigatorEntry(location.getHost(), ranked[i].count));
    }
    return entries;
}

/**
 * Orders map entries by descending integer value so the highest-scored
 * topic words sort to the front; entries with equal values compare as equal.
 */
public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
    public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
        // Integer.compareTo returns exactly -1/0/1, matching the previous
        // hand-written branches; arguments are swapped for descending order
        return o2.getValue().compareTo(o1.getValue());
    }
};

// NOTE(review): merged diff view — this hunk lost its +/- markers, so the
// removed implementation (ScoreCluster-based: getScores/TreeSet/return s)
// is interleaved with its replacement (Map.Entry array sorted by mecomp /
// return result). Only one of the two bodies belongs to each revision.
@SuppressWarnings("unchecked")
public ArrayList<NavigatorEntry> getTopicNavigator(final int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
// --- old body (removed): pull top scores from the ScoreCluster ---
final Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
final TreeSet<String> s = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
for (int i = 0; i < refs.length; i++) {
s.add((String) refs[i]);

// --- new body (added): snapshot the score map, sort entries by descending
// count and emit at most count NavigatorEntry objects, stopping early at
// the first zero count ---
Map.Entry<String, Integer>[] a = this.ref.entrySet().toArray(new Map.Entry[this.ref.size()]);
Arrays.sort(a, mecomp);
int rc = Math.min(count, a.length);
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
Map.Entry<String, Integer> e;
int c;
for (int i = 0; i < rc; i++) {
e = a[i];
c = e.getValue().intValue();
if (c == 0) break;
result.add(new NavigatorEntry(e.getKey(), c));
}
return s;
return result;
}

// NOTE(review): merged diff view — old signature/increment (addReferences,
// ref.incScore) and new ones (addTopic, manual get/put on the concurrent
// map) appear interleaved below because the +/- markers were stripped.
// Counts each candidate word that is longer than 2 chars, is not in the
// hard-coded stopword string, and is not one of the query words itself.
public void addReferences(final String[] words) {
public void addTopic(final String[] words) {
String word;
for (int i = 0; i < words.length; i++) {
word = words[i].toLowerCase();
Integer c;
if ((word.length() > 2) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(query.queryHashes.contains(Word.word2hash(word)))))
// old increment (removed): ScoreCluster did the counting
ref.incScore(word);
(!(query.queryHashes.contains(Word.word2hash(word))))) {
// new increment (added): first occurrence stores 1, later ones add 1
c = ref.get(word);
if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
}
}
}

// NOTE(review): merged diff view — both the removed addReferences(ResultEntry)
// and the added addTopics(ResultEntry) lines are shown. Per the commit
// message, topic statistics now come from the document title only; the
// url-component lines survive only as commented-out code.
protected void addReferences(final plasmaSearchEvent.ResultEntry resultEntry) {
protected void addTopics(final plasmaSearchEvent.ResultEntry resultEntry) {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description

// add references
addReferences(urlcomps);
addReferences(descrcomps);
//addTopic(urlcomps);
addTopic(descrcomps);
}

public ReferenceOrder getOrder() {
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/yacy/yacyClient.java
Expand Up @@ -604,8 +604,8 @@ public static String[] search(
yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
if (references != null) {
// add references twice, so they can be counted (must have at least 2 entries)
containerCache.addReferences(references.split(","));
containerCache.addReferences(references.split(","));
containerCache.addTopic(references.split(","));
containerCache.addTopic(references.split(","));
}
}

Expand Down

0 comments on commit ab06a6e

Please sign in to comment.