Skip to content

Commit

Permalink
redesign of the SortStack and SortStore classes:
Browse files Browse the repository at this point in the history
created a WeakPriorityBlockingQueue as a special implementation
of a PriorityBlockingQueue with a weak object binding.
- better abstraction of ordering technique
- fixed some bugs related to result numbering (distinguishing the different counters in the Queue)
- fixed an ordering bug in post-ranking (the ordering was decreased instead of increased)
- reversed the ranking numbering using a reversed ordering: the higher the ranking number, the better (now).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7128 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Sep 9, 2010
1 parent 03eb021 commit 348dece
Show file tree
Hide file tree
Showing 12 changed files with 433 additions and 471 deletions.
8 changes: 4 additions & 4 deletions htroot/yacy/search.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.ISO639;

import de.anomic.crawler.CrawlProfile;
Expand Down Expand Up @@ -192,7 +192,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
int joincount = 0;
QueryParams theQuery = null;
SearchEvent theSearch = null;
ArrayList<SortStack<ResultEntry>.stackElement> accu = null;
ArrayList<ReverseElement<ResultEntry>> accu = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
Segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
Expand Down Expand Up @@ -362,10 +362,10 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final long timer = System.currentTimeMillis();
final StringBuilder links = new StringBuilder(6000);
String resource = null;
SortStack<ResultEntry>.stackElement entry;
ReverseElement<ResultEntry> entry;
for (int i = 0; i < accu.size(); i++) {
entry = accu.get(i);
resource = entry.element.resource();
resource = entry.getElement().resource();
if (resource != null) {
links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING);
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/crawler/SitemapImporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public void process(sitemapParser.SitemapEntry entry) {
url,
null, // this.siteMapURL.toString(),
entry.url(),
new Date(),
entry.lastmod(new Date()),
this.crawlingProfile.handle(),
0,
0,
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/search/MediaSnippet.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
public ContentDomain type;
public DigestURI href, source;
public String name, attr, mime;
public int ranking;
public long ranking;
public int width, height;
public long fileSize;

public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final String attr, final int ranking, final DigestURI source) {
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final String attr, final long ranking, final DigestURI source) {
this.type = type;
this.href = href;
this.mime = mime;
Expand All @@ -73,7 +73,7 @@ public MediaSnippet(final ContentDomain type, final DigestURI href, final String
if ((this.attr == null) || (this.attr.length() == 0)) this.attr = "_";
}

public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final int width, final int height, final int ranking, final DigestURI source) {
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final int width, final int height, final long ranking, final DigestURI source) {
this.type = type;
this.href = href;
this.mime = mime;
Expand Down Expand Up @@ -188,7 +188,7 @@ public static ArrayList<MediaSnippet> computeImageSnippets(final DigestURI sourc
int appcount = queryhashes.size() * 2 -
TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
final int ranking = Integer.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
}
return result;
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/search/MetadataRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public void clear() throws IOException {
}

public int size() {
return urlIndexFile.size();
return urlIndexFile == null ? 0 : urlIndexFile.size();
}

public void close() {
Expand Down
79 changes: 31 additions & 48 deletions source/de/anomic/search/RankingProcess.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
import java.util.concurrent.TimeUnit;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
Expand All @@ -56,7 +58,6 @@
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.SortStack;

import de.anomic.yacy.graphics.ProfilingGraph;

Expand All @@ -77,9 +78,9 @@ public final class RankingProcess extends Thread {

private int remote_resourceSize, remote_indexCount, remote_peerCount;
private int local_resourceSize, local_indexCount;
private final SortStack<WordReferenceVars> stack;
private final WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>> stack;
private int feeders;
private final ConcurrentHashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>> doubleDomCache; // key = domhash (6 bytes); value = like stack
//private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process

private final Navigator ref; // reference score computation for the commonSense heuristic
Expand All @@ -93,8 +94,8 @@ public RankingProcess(final QueryParams query, final ReferenceOrder order, final
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchInclusion = null;
this.stack = new SortStack<WordReferenceVars>(maxentries, true);
this.doubleDomCache = new ConcurrentHashMap<String, SortStack<WordReferenceVars>>();
this.stack = new WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>(maxentries);
this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>>();
//this.handover = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.getOrdering(), 0);
this.query = query;
this.order = order;
Expand Down Expand Up @@ -245,17 +246,7 @@ public void add(final ReferenceContainer<WordReference> index, final boolean loc
if (urlhashes.has(fEntry.metadataHash())) continue;

// insert
if (maxentries < 0 || stack.size() < maxentries) {
// in case that we don't have enough yet, accept any new entry
stack.push(fEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) continue;

// take the entry. the stack is automatically reduced
// to the maximum size by deletion of elements at the bottom
stack.push(fEntry, r);
}
stack.put(new ReverseElement<WordReferenceVars>(fEntry, r)); // inserts the element and removed the worst (which is smallest)
try {
urlhashes.put(fEntry.metadataHash());
} catch (RowSpaceExceededException e) {
Expand Down Expand Up @@ -314,32 +305,30 @@ protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name


private SortStack<WordReferenceVars>.stackElement takeRWI(final boolean skipDoubleDom) {
private ReverseElement<WordReferenceVars> takeRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removes this entry from the list
SortStack<WordReferenceVars> m;
SortStack<WordReferenceVars>.stackElement rwi;
while (!stack.isEmpty()) {
rwi = stack.pop();
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>> m;
ReverseElement<WordReferenceVars> rwi;
while ((rwi = stack.poll()) != null) {
if (!skipDoubleDom) return rwi;
// check doubledom
final String domhash = new String(rwi.element.metadataHash()).substring(6);
final String domhash = new String(rwi.getElement().metadataHash()).substring(6);
m = this.doubleDomCache.get(domhash);
if (m == null) {
// first appearance of dom
m = new SortStack<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll, true);
m = new WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
this.doubleDomCache.put(domhash, m);
return rwi;
}
// second appearances of dom
m.push(rwi.element, rwi.weight);
m.put(rwi);
}
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
SortStack<WordReferenceVars>.stackElement bestEntry = null;
SortStack<WordReferenceVars>.stackElement o;
ReverseElement<WordReferenceVars> bestEntry = null;
ReverseElement<WordReferenceVars> o;
synchronized (this.doubleDomCache) {
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
final Iterator<WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>> i = this.doubleDomCache.values().iterator();
while (i.hasNext()) {
try {
m = i.next();
Expand All @@ -350,19 +339,19 @@ private SortStack<WordReferenceVars>.stackElement takeRWI(final boolean skipDoub
if (m == null) continue;
if (m.isEmpty()) continue;
if (bestEntry == null) {
bestEntry = m.top();
bestEntry = m.peek();
continue;
}
o = m.top();
if (o.weight.longValue() < bestEntry.weight.longValue()) {
o = m.peek();
if (o.getWeight() < bestEntry.getWeight()) {
bestEntry = o;
}
}
}
if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(new String(bestEntry.element.metadataHash()).substring(6));
o = m.pop();
m = this.doubleDomCache.get(new String(bestEntry.getElement().metadataHash()).substring(6));
o = m.poll();
//assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash()) : "bestEntry.element.metadataHash() = " + bestEntry.element.metadataHash() + ", o.element.metadataHash() = " + o.element.metadataHash();
return bestEntry;
}
Expand All @@ -382,17 +371,17 @@ public URIMetadataRow takeURL(final boolean skipDoubleDom, final int timeout) {
int p = -1;
byte[] urlhash;
while (System.currentTimeMillis() < timeLimit) {
final SortStack<WordReferenceVars>.stackElement obrwi = takeRWI(skipDoubleDom);
final ReverseElement<WordReferenceVars> obrwi = takeRWI(skipDoubleDom);
if (obrwi == null) {
if (this.feedingIsFinished()) return null;
try {Thread.sleep(50);} catch (final InterruptedException e1) {}
continue;
}
urlhash = obrwi.element.metadataHash();
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.element, obrwi.weight.longValue());
urlhash = obrwi.getElement().metadataHash();
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.getElement(), obrwi.getWeight());
if (page == null) {
try {
misses.put(obrwi.element.metadataHash());
misses.put(obrwi.getElement().metadataHash());
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
Expand Down Expand Up @@ -494,17 +483,16 @@ public URIMetadataRow takeURL(final boolean skipDoubleDom, final int timeout) {
}

protected int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = stack.size();
for (SortStack<WordReferenceVars> s: this.doubleDomCache.values()) {
c += s.size();
int c = stack.sizeAvailable();
for (WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>> s: this.doubleDomCache.values()) {
c += s.sizeAvailable();
}
return c;
}

public boolean isEmpty() {
if (!stack.isEmpty()) return false;
for (SortStack<WordReferenceVars> s: this.doubleDomCache.values()) {
for (WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>> s: this.doubleDomCache.values()) {
if (!s.isEmpty()) return false;
}
return true;
Expand All @@ -518,7 +506,7 @@ public int[] flagCount() {

public int filteredCount() {
// the number of index entries that are considered as result set
return this.stack.size();
return this.stack.sizeAvailable();
}

public int getLocalIndexCount() {
Expand Down Expand Up @@ -546,11 +534,6 @@ public int getRemotePeerCount() {
return this.remote_peerCount;
}

protected void remove(final WordReferenceVars reference) {
stack.remove(reference);
urlhashes.remove(reference.urlHash);
}

public Iterator<byte[]> miss() {
return this.misses.iterator();
}
Expand Down
7 changes: 6 additions & 1 deletion source/de/anomic/search/ReferenceOrder.java
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,11 @@ public int authority(final byte[] urlHash) {
return (doms.getScore(new String(urlHash, 6, 6)) << 8) / (1 + this.maxdomcount);
}

/**
* return the ranking of a given word entry
* @param t
* @return a ranking: the higher the number, the better is the ranking
*/
public long cardinal(final WordReferenceVars t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
Expand Down Expand Up @@ -247,7 +252,7 @@ public long cardinal(final WordReferenceVars t) {

//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;

return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
return r; // the higher the number the better the ranking.
}

private static final String patchUK(String l) {
Expand Down
Loading

0 comments on commit 348dece

Please sign in to comment.