Skip to content

Commit

Permalink
some redesign of min/max and normalization computation during search …
Browse files Browse the repository at this point in the history
…result ordering

This saves about 1 millisecond per URL reference, which noticeably speeds up
the search result computation when the searched word appears very often
(a speed-up of 1 second or more).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4033 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Aug 6, 2007
1 parent 9678d1b commit 5c1b444
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 69 deletions.
2 changes: 1 addition & 1 deletion build.properties
Expand Up @@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4

# Release Configuration
releaseVersion=0.541
releaseVersion=0.542
releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy
Expand Down
84 changes: 27 additions & 57 deletions source/de/anomic/index/indexRWIEntry.java
Expand Up @@ -194,7 +194,7 @@ public int quality() {
}

/**
 * Returns the last-modified information of this entry as an int.
 * NOTE(review): two return statements appear here (the conversion through
 * plasmaWordIndex.microDateDays and the direct column read); this looks like the
 * old and the new form of the same line shown together - confirm which one remains.
 */
public int virtualAge() {
return plasmaWordIndex.microDateDays(lastModified());
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicroDateDays format
}

public long lastModified() {
Expand Down Expand Up @@ -284,31 +284,35 @@ public int worddistance() {
}

/**
 * Lowers each ranking column of t to the corresponding value of other wherever
 * other's value is smaller. Only t.entry is written; other is only read.
 *
 * NOTE(review): virtualAge() reads col_lastModified directly as MicroDateDays, while
 * this method writes the value returned by lastModified() into the same column -
 * confirm that both use the same unit.
 *
 * @param t     the entry that accumulates the minimum values (modified in place)
 * @param other the entry whose values are compared against t
 */
public static final void min(indexRWIEntry t, indexRWIEntry other) {
// NOTE(review): the following eleven comparisons are repeated further down in a form
// that caches other's value; this looks like the pre-change and post-change versions
// of the same lines shown together - confirm which set should remain.
if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordsintext() > other.wordsintext()) t.entry.setCol(col_wordsInText, other.wordsintext());
if (t.phrasesintext() > other.phrasesintext()) t.entry.setCol(col_phrasesInText, other.phrasesintext());
if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.urllength() > other.urllength()) t.entry.setCol(col_urlLength, other.urllength());
if (t.urlcomps() > other.urlcomps()) t.entry.setCol(col_urlComps, other.urlcomps());
if (t.wordsintitle() > other.wordsintitle() ) t.entry.setCol(col_wordsInTitle, other.wordsintitle());
// v (int columns) and w (long column) hold other's value so each getter of other
// is evaluated only once per comparison
int v;
long w;
// reuse the cached v here as well, consistent with every other line of this method
if (t.hitcount() > (v = other.hitcount())) t.entry.setCol(col_hitcount, v);
if (t.wordsintext() > (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v);
if (t.phrasesintext() > (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v);
if (t.posintext() > (v = other.posintext())) t.entry.setCol(col_posintext, v);
if (t.posinphrase() > (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v);
if (t.posofphrase() > (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v);
if (t.worddistance() > (v = other.worddistance())) t.entry.setCol(col_worddistance, v);
if (t.lastModified() > (w = other.lastModified())) t.entry.setCol(col_lastModified, w);
if (t.urllength() > (v = other.urllength())) t.entry.setCol(col_urlLength, v);
if (t.urlcomps() > (v = other.urlcomps())) t.entry.setCol(col_urlComps, v);
if (t.wordsintitle() > (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v);
}

/**
 * Raises each ranking column of t to the corresponding value of other wherever
 * other's value is larger. Only t.entry is written; other is only read.
 *
 * NOTE(review): virtualAge() reads col_lastModified directly as MicroDateDays, while
 * this method writes the value returned by lastModified() into the same column -
 * confirm that both use the same unit.
 *
 * @param t     the entry that accumulates the maximum values (modified in place)
 * @param other the entry whose values are compared against t
 */
public static final void max(indexRWIEntry t, indexRWIEntry other) {
// NOTE(review): the following eleven comparisons are repeated further down in a form
// that caches other's value; this looks like the pre-change and post-change versions
// of the same lines shown together - confirm which set should remain.
if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordsintext() < other.wordsintext()) t.entry.setCol(col_wordsInText, other.wordsintext());
if (t.phrasesintext() < other.phrasesintext()) t.entry.setCol(col_phrasesInText, other.phrasesintext());
if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.urllength() < other.urllength()) t.entry.setCol(col_urlLength, other.urllength());
if (t.urlcomps() < other.urlcomps()) t.entry.setCol(col_urlComps, other.urlcomps());
if (t.wordsintitle() < other.wordsintitle() ) t.entry.setCol(col_wordsInTitle, other.wordsintitle());
// v (int columns) and w (long column) hold other's value so each getter of other
// is evaluated only once per comparison
int v;
long w;
if (t.hitcount() < (v = other.hitcount())) t.entry.setCol(col_hitcount, v);
if (t.wordsintext() < (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v);
if (t.phrasesintext() < (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v);
if (t.posintext() < (v = other.posintext())) t.entry.setCol(col_posintext, v);
if (t.posinphrase() < (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v);
if (t.posofphrase() < (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v);
if (t.worddistance() < (v = other.worddistance())) t.entry.setCol(col_worddistance, v);
if (t.lastModified() < (w = other.lastModified())) t.entry.setCol(col_lastModified, w);
if (t.urllength() < (v = other.urllength())) t.entry.setCol(col_urlLength, v);
if (t.urlcomps() < (v = other.urlcomps())) t.entry.setCol(col_urlComps, v);
if (t.wordsintitle() < (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v);
}


Expand All @@ -319,40 +323,6 @@ public void min(indexRWIEntry other) {
/**
 * Instance form of the static max: raises the ranking columns of this entry
 * to those of other by delegating to max(this, other).
 *
 * @param other the entry whose values are compared against this entry
 */
public void max(indexRWIEntry other) {
max(this, other);
}

/**
 * Rescales the ranking columns of t in place against the range given by min and max.
 * For each column a value of 0 stays 0; any other value is replaced by
 * 1 + 255 * (value - min) / (1 + max - min), using the same column of min and max.
 * The division fails if 1 + max - min is 0 for a column.
 *
 * @param t   the entry whose columns are overwritten
 * @param min entry supplying the lower bound of each column
 * @param max entry supplying the upper bound of each column
 */
static void normalize(indexRWIEntry t, indexRWIEntry min, indexRWIEntry max) {
// all three entries are expected to carry a 12-character url hash
assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash();
assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash();
assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash();
// debug output for the case that the worddistance divisor below would be zero
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm());
//System.out.println("Normalize:\nentry = " + t.toPropertyForm(true));
//System.out.println("min = " + min.toPropertyForm(true));
//System.out.println("max = " + max.toPropertyForm(true));
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
t.entry.setCol(col_wordsInText , (t.wordsintext() == 0) ? 0 : 1 + 255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext()));
t.entry.setCol(col_phrasesInText, (t.phrasesintext() == 0) ? 0 : 1 + 255 * (t.phrasesintext() - min.phrasesintext() ) / (1 + max.phrasesintext() - min.phrasesintext()));
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: a division by zero occurs here, which can only happen if the normalization did not work.
t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
t.entry.setCol(col_urlLength , (t.urllength() == 0) ? 0 : 1 + 255 * (t.urllength() - min.urllength() ) / (1 + max.urllength() - min.urllength()));
t.entry.setCol(col_urlComps , (t.urlcomps() == 0) ? 0 : 1 + 255 * (t.urlcomps() - min.urlcomps() ) / (1 + max.urlcomps() - min.urlcomps()));
t.entry.setCol(col_wordsInTitle , (t.wordsintitle() == 0) ? 0 : 1 + 255 * (t.wordsintitle() - min.wordsintitle()) / (1 + max.wordsintitle() - min.wordsintitle()));

//System.out.println("out = " + t.toPropertyForm(true));
}

/**
 * Instance form of the static normalize: rescales the ranking columns of this
 * entry in place by delegating to normalize(this, min, max).
 *
 * @param min entry supplying the lower bound of each column
 * @param max entry supplying the upper bound of each column
 */
public void normalize(indexRWIEntry min, indexRWIEntry max) {
normalize(this, min, max);
}

/**
 * Clones this entry, normalizes the clone against the given min/max entries
 * and returns the clone.
 *
 * @param min entry supplying the lower bound of each column
 * @param max entry supplying the upper bound of each column
 * @return the normalized clone
 */
public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) {
    // this entry is expected to carry a 12-character url hash
    assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash();

    final indexRWIEntry normalizedCopy = (indexRWIEntry) this.clone();
    normalizedCopy.normalize(min, max);
    return normalizedCopy;
}

public boolean isNewer(indexRWIEntry other) {
if (other == null) return true;
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/plasmaSearchPreOrder.java
Expand Up @@ -110,7 +110,7 @@ public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
}
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), searchWords), 16) + iEntry.urlHash(), iEntry);
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry, this.entryMin, this.entryMax, searchWords), 16) + iEntry.urlHash(), iEntry);
}
this.filteredCount = pageAcc.size();
}
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/plasmaSearchProcessing.java
Expand Up @@ -380,7 +380,7 @@ public plasmaSearchPostOrder urlFetch(
int minEntries = getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT);
try {
ordering: while (preorder.hasNext()) {
if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= minEntries)) break;
if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= 5 * minEntries)) break;
preorderEntry = preorder.next();
entry = (indexRWIEntry) preorderEntry[0];
// load only urls if there was not yet a root url of that hash
Expand Down
51 changes: 42 additions & 9 deletions source/de/anomic/plasma/plasmaSearchRankingProfile.java
Expand Up @@ -89,7 +89,7 @@ public class plasmaSearchRankingProfile {
private int
coeff_domlength, coeff_ybr, coeff_date, coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext,
coeff_llocal, coeff_lother, coeff_urllength, coeff_urlcomps, coeff_hitcount,
coeff_posintext, coeff_posofphrase, coeff_worddistance,
coeff_posintext, coeff_posofphrase, coeff_posinphrase, coeff_worddistance,
coeff_appurl, coeff_appdescr, coeff_appauthor, coeff_apptags, coeff_appref, coeff_appemph,
coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer;
Expand All @@ -110,6 +110,7 @@ public plasmaSearchRankingProfile(String mediatype) {
coeff_hitcount = 5;
coeff_posintext = 7;
coeff_posofphrase = 6;
coeff_posinphrase = 1;
coeff_worddistance = 15;
coeff_appurl = 14;
coeff_appdescr = 13;
Expand Down Expand Up @@ -249,6 +250,44 @@ public String toExternalURLGet(String prefix) {
return new String(ext);
}

/**
 * Computes the pre-ranking score of an index entry.
 *
 * The entry does not have to be normalized beforehand: each numeric attribute of t
 * is scaled on the fly against the same attribute of min and max (see
 * rangeScaled). Every contribution is shifted left by its coefficient, so a
 * coefficient acts as a power-of-two weight. For date, url length, url components,
 * position in text, position of phrase, position in phrase and word distance the
 * scaled value is subtracted from 255, so smaller raw values contribute more.
 *
 * @param t             the entry to be ranked
 * @param min           entry supplying the lower bound of each attribute
 * @param max           entry supplying the upper bound of each attribute
 * @param searchedWords the searched words, passed on to plasmaURL.probablyWordURL
 * @return the accumulated ranking value
 */
public long preRanking(indexRWIEntry t, indexRWIEntry min, indexRWIEntry max, TreeSet searchedWords) {
    long score = 0;

    // contributions derived from the url hash
    score += (256 - plasmaURL.domLengthNormalized(t.urlHash())) << coeff_domlength;
    score += plasmaSearchPreOrder.ybr_p(t.urlHash()) << coeff_ybr;

    // range-scaled numeric attributes
    score += (255 - rangeScaled(t.virtualAge(), min.virtualAge(), max.virtualAge())) << coeff_date;
    score += rangeScaled(t.wordsintitle(), min.wordsintitle(), max.wordsintitle()) << coeff_wordsintitle;
    score += rangeScaled(t.wordsintext(), min.wordsintext(), max.wordsintext()) << coeff_wordsintext;
    score += rangeScaled(t.phrasesintext(), min.phrasesintext(), max.phrasesintext()) << coeff_phrasesintext;
    score += t.llocal() << coeff_llocal;
    score += t.lother() << coeff_lother;
    score += (255 - rangeScaled(t.urllength(), min.urllength(), max.urllength())) << coeff_urllength;
    score += (255 - rangeScaled(t.urlcomps(), min.urlcomps(), max.urlcomps())) << coeff_urlcomps;
    score += rangeScaled(t.hitcount(), min.hitcount(), max.hitcount()) << coeff_hitcount;
    score += (255 - rangeScaled(t.posintext(), min.posintext(), max.posintext())) << coeff_posintext;
    score += (255 - rangeScaled(t.posofphrase(), min.posofphrase(), max.posofphrase())) << coeff_posofphrase;
    score += (255 - rangeScaled(t.posinphrase(), min.posinphrase(), max.posinphrase())) << coeff_posinphrase;
    score += (255 - rangeScaled(t.worddistance(), min.worddistance(), max.worddistance())) << coeff_worddistance;

    // each set flag adds a fixed bonus of 256, weighted by its coefficient
    final kelondroBitfield flags = t.flags();
    if (flags.get(indexRWIEntry.flag_app_url)) {
        score += 256 << coeff_appurl;
    }
    if (flags.get(indexRWIEntry.flag_app_descr)) {
        score += 256 << coeff_appdescr;
    }
    if (flags.get(indexRWIEntry.flag_app_author)) {
        score += 256 << coeff_appauthor;
    }
    if (flags.get(indexRWIEntry.flag_app_tags)) {
        score += 256 << coeff_apptags;
    }
    if (flags.get(indexRWIEntry.flag_app_reference)) {
        score += 256 << coeff_appref;
    }
    if (flags.get(indexRWIEntry.flag_app_emphasized)) {
        score += 256 << coeff_appemph;
    }
    if (flags.get(plasmaCondenser.flag_cat_indexof)) {
        score += 256 << coeff_catindexof;
    }
    if (flags.get(plasmaCondenser.flag_cat_hasimage)) {
        score += 256 << coeff_cathasimage;
    }
    if (flags.get(plasmaCondenser.flag_cat_hasaudio)) {
        score += 256 << coeff_cathasaudio;
    }
    if (flags.get(plasmaCondenser.flag_cat_hasvideo)) {
        score += 256 << coeff_cathasvideo;
    }
    if (flags.get(plasmaCondenser.flag_cat_hasapp)) {
        score += 256 << coeff_cathasapp;
    }

    // bonuses derived from the url hash; they reuse the url-length and app-url coefficients
    if (plasmaURL.probablyRootURL(t.urlHash())) {
        score += 16 << coeff_urllength;
    }
    if (plasmaURL.probablyWordURL(t.urlHash(), searchedWords) != null) {
        score += 256 << coeff_appurl;
    }

    return score;
}

/**
 * Scales value against the range [lo, hi] using integer arithmetic:
 * 255 * (value - lo) / (1 + hi - lo).
 */
private static int rangeScaled(int value, int lo, int hi) {
    return 255 * (value - lo) / (1 + hi - lo);
}
/*
public long preRanking(indexRWIEntry normalizedEntry, TreeSet searchedWords) {
// the normalizedEntry must be a normalized indexEntry
long ranking = 0;
Expand Down Expand Up @@ -282,16 +321,10 @@ public long preRanking(indexRWIEntry normalizedEntry, TreeSet searchedWords) {
ranking += (plasmaURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << coeff_urllength : 0;
ranking += (plasmaURL.probablyWordURL(normalizedEntry.urlHash(), searchedWords) != null) ? 256 << coeff_appurl : 0;

/*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
else
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking);
*/
return ranking;
}

*/
public long postRanking(
long ranking,
plasmaSearchQuery query,
Expand Down

0 comments on commit 5c1b444

Please sign in to comment.