
Commit

separated rwi constraint evaluation from rwi ranking and added concurrency

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6274 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Aug 27, 2009
1 parent ce7924d commit 0ba1bea
Showing 2 changed files with 154 additions and 110 deletions.
114 changes: 72 additions & 42 deletions source/de/anomic/kelondro/text/ReferenceOrder.java
@@ -26,11 +26,11 @@

package de.anomic.kelondro.text;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import de.anomic.document.Condenser;
import de.anomic.kelondro.order.Bitfield;
@@ -43,8 +43,8 @@
import de.anomic.yacy.yacyURL;

public class ReferenceOrder {
private WordReferenceVars min, max;
private final RankingProfile ranking;
private WordReferenceVars min, max;
private final RankingProfile ranking;
private final ScoreCluster<String> doms; // collected for "authority" heuristic
private int maxdomcount;
private String language;
@@ -58,8 +58,75 @@ public ReferenceOrder(final RankingProfile profile, String language) {
this.language = language;
}

public class Normalizer extends Thread {

private ReferenceContainer<WordReference> container;
private BlockingQueue<WordReferenceVars> decodedEntries;

public Normalizer(final ReferenceContainer<WordReference> container) {
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);
this.container = container;
this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
}

public void run() {
BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);

WordReferenceVars entryMin = null;
WordReferenceVars entryMax = null;
HashMap<String, Integer> doms0 = new HashMap<String, Integer>();
Integer int1 = 1;

WordReferenceVars iEntry;
String dom;
Integer count;
try {
while ((iEntry = vars.take()) != WordReferenceVars.poison) {
decodedEntries.put(iEntry);
// find min/max
if (entryMin == null) entryMin = iEntry.clone(); else entryMin.min(iEntry);
if (entryMax == null) entryMax = iEntry.clone(); else entryMax.max(iEntry);
// update domcount
dom = iEntry.metadataHash().substring(6);
count = doms0.get(dom);
if (count == null) {
doms0.put(dom, int1);
} else {
doms0.put(dom, Integer.valueOf(count.intValue() + 1));
}
}
} catch (InterruptedException e) {}

if (min == null) min = entryMin.clone(); else min.min(entryMin);
if (max == null) max = entryMax.clone(); else max.max(entryMax);
Map.Entry<String, Integer> entry;
final Iterator<Map.Entry<String, Integer>> di = doms0.entrySet().iterator();
while (di.hasNext()) {
entry = di.next();
doms.addScore(entry.getKey(), (entry.getValue()).intValue());
}

if (doms.size() > 0) maxdomcount = doms.getMaxScore();
try {
decodedEntries.put(WordReferenceVars.poison);
} catch (InterruptedException e) {}
}

public BlockingQueue<WordReferenceVars> decoded() {
return this.decodedEntries;
}
}

public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
Normalizer n = new Normalizer(container);
n.start();
return n.decoded();
}

/*
public ArrayList<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
// normalize ranking: find minimum and maxiumum of separate ranking criteria
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);
BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);
@@ -101,6 +168,7 @@ public ArrayList<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore();
return decodedEntries;
}
*/

public int authority(final String urlHash) {
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
@@ -154,42 +222,4 @@ private static final String patchUK(String l) {
// this is to patch a bad language name setting that was used in 0.60 and before
if (l.equals("uk")) return "en"; else return l;
}

public static class minmaxfinder {

private WordReferenceVars entryMin;
private WordReferenceVars entryMax;
private final HashMap<String, Integer> doms;
private final Integer int1;
private final ArrayList<WordReferenceVars> decodedEntries;

public minmaxfinder(final BlockingQueue<WordReferenceVars> vars) {
this.doms = new HashMap<String, Integer>();
this.int1 = 1;
this.decodedEntries = new ArrayList<WordReferenceVars>();
this.entryMin = null;
this.entryMax = null;
WordReferenceVars iEntry;
String dom;
Integer count;
try {
while ((iEntry = vars.take()) != WordReferenceVars.poison) {
this.decodedEntries.add(iEntry);
// find min/max
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
// update domcount
dom = iEntry.metadataHash().substring(6);
count = doms.get(dom);
if (count == null) {
doms.put(dom, int1);
} else {
doms.put(dom, Integer.valueOf(count.intValue() + 1));
}
}
} catch (InterruptedException e) {}
}

}

}
150 changes: 82 additions & 68 deletions source/de/anomic/search/RankingProcess.java
@@ -37,6 +37,7 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;

import de.anomic.document.Condenser;
@@ -165,83 +166,96 @@ public void add(final ReferenceContainer<WordReference> index, final boolean local
long timer = System.currentTimeMillis();

// normalize entries
final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);

// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
Long r;
HostInfo hs;
String domhash;
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
for (WordReferenceVars iEntry: decodedEntries) {
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
WordReferenceVars iEntry;
final ArrayList<WordReferenceVars> filteredEntries = new ArrayList<WordReferenceVars>();
// apply all filter
try {
while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) {
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
}

// kick out entries that are too bad according to current findings
r = Long.valueOf(order.cardinal(iEntry));
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;

// check constraints
if (!testFlags(iEntry)) continue;

// check document domain
if (query.contentdom != QueryParams.CONTENTDOM_TEXT) {
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
}
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
}

// check constraints
if (!testFlags(iEntry)) continue;

// check document domain
if (query.contentdom != QueryParams.CONTENTDOM_TEXT) {
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
}

// check tld domain
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
// filter out all tld that do not match with wanted tld domain
continue;
}

// check site constraints
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}

// count domZones
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;

// get statistics for host navigator
if (nav_hosts) {
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
} else {
hs.inc();
}
}

// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue;
}
// double-check
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
}

// increase counter for statistics
if (!local) this.remote_indexCount++;
}
// check tld domain
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
// filter out all tld that do not match with wanted tld domain
continue;
}

// check site constraints
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}

// count domZones
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;

// get statistics for host navigator
if (nav_hosts) {
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
} else {
hs.inc();
}
}

// accept
filteredEntries.add(iEntry);

// increase counter for statistics
if (!local) this.remote_indexCount++;
}
} catch (InterruptedException e) {}

// do the ranking
Long r;
for (WordReferenceVars fEntry: filteredEntries) {

// kick out entries that are too bad according to current findings
r = Long.valueOf(order.cardinal(fEntry));
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;

// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(fEntry.metadataHash())) continue;
stack.push(fEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue;
}
// double-check
if (urlhashes.containsKey(fEntry.metadataHash())) continue;
stack.push(fEntry, r);
}

}

//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
