Skip to content

Commit

Permalink
changes towards better join-search
Browse files Browse the repository at this point in the history
- added generation of a compressed index within remote peers during global search
- added selection of specific urls within remote peers during secondary global search


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2539 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Sep 10, 2006
1 parent 4a49446 commit 74d1dea
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 53 deletions.
6 changes: 3 additions & 3 deletions htroot/IndexControl_p.java
Expand Up @@ -149,7 +149,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
if (delurl || delurlref) {
// generate an urlx array
indexContainer index = null;
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
Iterator en = index.entries();
int i = 0;
urlx = new String[index.size()];
Expand Down Expand Up @@ -252,7 +252,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
indexContainer index;
String result;
long starttime = System.currentTimeMillis();
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
// built urlCache
Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap();
Expand Down Expand Up @@ -424,7 +424,7 @@ public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, S
// search for a word hash and generate a list of url links
indexContainer index = null;
try {
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);

final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {
Expand Down
3 changes: 2 additions & 1 deletion htroot/yacy/search.html
Expand Up @@ -8,4 +8,5 @@
joincount=#[joincount]#
count=#[linkcount]#
#[links]#
#[indexcount]#
#[indexcount]#
#[indexabstract]#
52 changes: 42 additions & 10 deletions htroot/yacy/search.java
Expand Up @@ -49,6 +49,7 @@

import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import de.anomic.http.httpHeader;
Expand Down Expand Up @@ -81,7 +82,8 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
final String oseed = post.get("myseed", ""); // complete seed of the requesting peer
// final String youare = post.get("youare", ""); // seed hash of the target peer, used for testing network stability
final String key = post.get("key", ""); // transmission key for response
final String query = post.get("query", ""); // a string of word hashes
final String query = post.get("query", ""); // a string of word hashes that shall be searched and combined
final String urls = post.get("urls", ""); // a string of url hashes that are preselected for the search: no other may be returned
// final String fwdep = post.get("fwdep", ""); // forward depth. if "0" then peer may NOT ask another peer for more results
// final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
final long duetime= post.getLong("duetime", 3000);
Expand Down Expand Up @@ -117,34 +119,64 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve

yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links");
long timestamp1 = System.currentTimeMillis();

// prepare a search profile
plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY});
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
Set containers = theSearch.localSearchContainers();
indexContainer localResults = theSearch.localSearchJoin(containers);
int joincount = localResults.size();
plasmaSearchResult acc = theSearch.order(localResults);

// set statistic details of search result
prop.put("joincount", Integer.toString(joincount));
// retrieve index containers from search request
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
// Build the url preselection: 'urls' is a concatenation of 12-character url hashes.
// urlselection stays null (meaning: no restriction) when 'urls' is empty or its
// length is not a multiple of 12.
Set urlselection = null;
if ((urls.length() > 0) && (urls.length() % 12 == 0)) {
    // the set must exist before hashes can be added to it
    urlselection = new HashSet();
    for (int i = 0; i < (urls.length() / 12); i++) {
        // the i-th hash occupies the characters [i * 12, (i + 1) * 12)
        urlselection.add(urls.substring(i * 12, (i + 1) * 12));
    }
}
Map containers = theSearch.localSearchContainers(urlselection);

// set statistic details of search result and find best result index set
String maxcounthash = null;
if (containers == null) {
prop.put("indexcount", "");
} else {
Iterator ci = containers.iterator();
Iterator ci = containers.entrySet().iterator();
StringBuffer indexcount = new StringBuffer();
Map.Entry entry;
String wordhash;
int maxcount = -1;
while (ci.hasNext()) {
indexContainer container = (indexContainer) ci.next();
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
// remember the word hash of the largest container seen so far; maxcount has to be
// raised together with maxcounthash, otherwise every container beats the initial -1
// and simply the last one iterated would be selected
if (container.size() > maxcount) {
    maxcount = container.size();
    maxcounthash = wordhash;
}
indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString);
}
prop.put("indexcount", new String(indexcount));
}

// generate compressed index for maxcounthash
// this is not needed if the search is restricted to specific urls, because it is a re-search
if ((maxcounthash == null) || (urls.length() != 0)) {
prop.put("indexabstract","");
} else {
String indexabstract = "indexabstract." + maxcounthash + "=" + ((indexContainer) containers.get(maxcounthash)).compressedIndex(1000);
yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract);
prop.put("indexabstract", indexabstract);
}

// join and order the result
indexContainer localResults = theSearch.localSearchJoin(containers.values());
int joincount = localResults.size();
prop.put("joincount", Integer.toString(joincount));
plasmaSearchResult acc = theSearch.order(localResults);

// prepare result
if ((joincount == 0) || (acc == null)) {

// no results
prop.put("links", "");
prop.put("linkcount", "0");
prop.put("references", "");

} else {

// result is a List of urlEntry elements
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/index/indexAbstractRI.java
Expand Up @@ -36,7 +36,7 @@ public indexContainer addEntry(String wordHash, indexEntry newEntry, long update
}

public long getUpdateTime(String wordHash) {
indexContainer entries = getContainer(wordHash, false, -1);
indexContainer entries = getContainer(wordHash, null, false, -1);
if (entries == null) return 0;
return entries.updated();
}
Expand Down
5 changes: 3 additions & 2 deletions source/de/anomic/index/indexCollectionRI.java
Expand Up @@ -108,10 +108,11 @@ public void remove() {

}

public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime) {
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime) {
try {
kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty);
if (collection == null) return null;
collection.select(urlselection);
if ((collection == null) || (collection.size() == 0)) return null;
return new indexRowSetContainer(wordHash, collection);
} catch (IOException e) {
return null;
Expand Down
5 changes: 4 additions & 1 deletion source/de/anomic/index/indexContainer.java
Expand Up @@ -32,6 +32,7 @@
import java.util.Set;

import de.anomic.kelondro.kelondroOrder;
import de.anomic.server.serverByteBuffer;

public interface indexContainer {

Expand All @@ -43,7 +44,9 @@ public interface indexContainer {

public void setWordHash(String newWordHash);
public String getWordHash();

public serverByteBuffer compressedIndex(long maxtime);
public void select(Set urlselection);

public void setOrdering(kelondroOrder newOrder, int newColumn);
public kelondroOrder order();
public int orderColumn();
Expand Down
10 changes: 8 additions & 2 deletions source/de/anomic/index/indexRAMCacheRI.java
Expand Up @@ -386,8 +386,14 @@ private long longEmit(int intTime) {
return (((long) intTime) * (long) 1000) + initTime;
}

public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) {
return (indexContainer) wCache.get(wordHash);
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime_dummy) {
    // Returns the cached container for wordHash, or null if the cache holds none.
    // If urlselection is null the cached container itself is returned; otherwise a
    // top-level clone restricted to urlselection is returned.
    // deleteIfEmpty and maxtime_dummy are not used by this implementation.
    indexContainer cached = (indexContainer) wCache.get(wordHash);
    // nothing cached: there is nothing to clone, report 'no container' like the unrestricted case does
    if ((cached == null) || (urlselection == null)) return cached;
    // apply the selection to a clone, not to the cached instance
    indexContainer ic = cached.topLevelClone();
    ic.select(urlselection);
    return ic;
}

public indexContainer deleteContainer(String wordHash) {
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/index/indexRI.java
Expand Up @@ -53,7 +53,7 @@ public interface indexRI {

public long getUpdateTime(String wordHash);

public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime);
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime);
public indexContainer deleteContainer(String wordHash);

public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
Expand Down
51 changes: 47 additions & 4 deletions source/de/anomic/index/indexRowSetContainer.java
Expand Up @@ -27,16 +27,19 @@
package de.anomic.index;

import java.lang.reflect.Method;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import java.util.TreeMap;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.server.serverByteBuffer;

public class indexRowSetContainer extends kelondroRowSet implements indexContainer {

Expand Down Expand Up @@ -64,6 +67,43 @@ public indexContainer topLevelClone() {
return newContainer;
}

public serverByteBuffer compressedIndex(long maxtime) {
    // Produces a compact text form of the url references in this container:
    //   '{' key ':' value (',' key ':' value)* '}'
    // The key is the url hash from position 6 on (treated here as the domain part),
    // the value is the concatenation of the first six characters of every url hash
    // that shares this key. Keys appear in the order of the TreeMap.
    // maxtime is a time budget in milliseconds, a negative value disables the limit.
    // When the budget runs out, the result only covers what was processed until then.
    final long deadline = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;

    // phase 1: group the leading hash parts by their key
    final TreeMap pathsByDomain = new TreeMap();
    synchronized (this) {
        for (Iterator refs = entries(); refs.hasNext();) {
            final indexEntry ref = (indexEntry) refs.next();
            final String domain = ref.urlHash().substring(6);
            final String known = (String) pathsByDomain.get(domain);
            final String path = ref.urlHash().substring(0, 6);
            pathsByDomain.put(domain, (known == null) ? path : known + path);
            if (System.currentTimeMillis() > deadline) break;
        }
    }

    // phase 2: serialize the groups into the byte buffer
    final serverByteBuffer result = new serverByteBuffer(this.size() * indexURLEntry.urlEntryRow.width(0) / 2);
    result.append('{');
    for (Iterator groups = pathsByDomain.entrySet().iterator(); groups.hasNext();) {
        final Map.Entry group = (Map.Entry) groups.next();
        result.append((String) group.getKey());
        result.append(':');
        result.append((String) group.getValue());
        // the deadline is checked before the separator is written, so an aborted
        // serialization does not end with a dangling ','
        if (System.currentTimeMillis() > deadline) break;
        if (groups.hasNext()) result.append(',');
    }
    result.append('}');
    result.trim();
    return result;
}

public void setWordHash(String newWordHash) {
this.wordHash = newWordHash;
}
Expand Down Expand Up @@ -94,15 +134,18 @@ public int add(indexEntry[] entries, long updateTime) {

public int add(indexContainer c, long maxTime) {
// returns the number of new elements
long startTime = System.currentTimeMillis();
long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
if (c == null) return 0;
int x = 0;
synchronized (c) {
Iterator i = c.entries();
while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
while (i.hasNext()) {
try {
if (addi((indexEntry) i.next())) x++;
} catch (ConcurrentModificationException e) {}
} catch (ConcurrentModificationException e) {
e.printStackTrace();
}
if (System.currentTimeMillis() > timeout) break;
}
}
this.lastTimeWrote = java.lang.Math.max(this.lastTimeWrote, c.updated());
Expand Down Expand Up @@ -202,7 +245,7 @@ public static Object containerMerge(Object a, Object b) {
return c;
}

public static indexContainer joinContainer(Set containers, long time, int maxDistance) {
public static indexContainer joinContainer(Collection containers, long time, int maxDistance) {

long stamp = System.currentTimeMillis();

Expand Down
13 changes: 13 additions & 0 deletions source/de/anomic/kelondro/kelondroRowCollection.java
Expand Up @@ -25,6 +25,7 @@
package de.anomic.kelondro;

import java.util.Iterator;
import java.util.Set;

public class kelondroRowCollection {

Expand Down Expand Up @@ -293,6 +294,18 @@ public void remove() {
}
}

public void select(Set keys) {
    // Keeps only those rows whose column 0 (read as string) is contained in keys;
    // every other row is removed through the row iterator.
    // A null key set means 'no restriction': the collection is left untouched.
    if (keys != null) {
        synchronized (this) {
            for (Iterator rowIter = rows(); rowIter.hasNext();) {
                final kelondroRow.Entry candidate = (kelondroRow.Entry) rowIter.next();
                final boolean wanted = keys.contains(candidate.getColString(0, null));
                if (!wanted) rowIter.remove();
            }
        }
    }
}

protected final void sort(kelondroOrder newOrder, int newColumn) {
if ((this.sortOrder == null) ||
Expand Down
15 changes: 9 additions & 6 deletions source/de/anomic/plasma/plasmaSearchEvent.java
Expand Up @@ -42,9 +42,11 @@

package de.anomic.plasma;

import java.util.Collection;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import java.util.HashSet;
import java.util.Set;

import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
Expand Down Expand Up @@ -131,7 +133,7 @@ public plasmaSearchResult search() {
searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);

// meanwhile do a local search
indexContainer rcLocal = localSearchJoin(localSearchContainers());
indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
plasmaSearchResult localResult = orderLocal(rcLocal, timeout);

// catch up global results:
Expand Down Expand Up @@ -161,7 +163,7 @@ public plasmaSearchResult search() {
lastEvent = this;
return result;
} else {
indexContainer rcLocal = localSearchJoin(localSearchContainers());
indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
plasmaSearchResult result = order(rcLocal);
result.localContributions = rcLocal.size();

Expand All @@ -173,13 +175,14 @@ public plasmaSearchResult search() {
}
}

public Set localSearchContainers() {
public Map localSearchContainers(Set urlselection) {
// search for the set of hashes and return the set of containers containing the search result

// retrieve entities that belong to the hashes
profileLocal.startTimer();
Set containers = wordIndex.getContainers(
Map containers = wordIndex.getContainers(
query.queryHashes,
urlselection,
true,
true,
profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_COLLECTION));
Expand All @@ -190,7 +193,7 @@ public Set localSearchContainers() {
return containers;
}

public indexContainer localSearchJoin(Set containers) {
public indexContainer localSearchJoin(Collection containers) {
// join a search result and return the joincount (number of pages after join)

// since this is a conjunction we return an empty entity if any word is not known
Expand Down

0 comments on commit 74d1dea

Please sign in to comment.