Skip to content

Commit

Permalink
added term-frequency ranking
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4413 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jan 28, 2008
1 parent 1a296af commit 974fea7
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 92 deletions.
2 changes: 2 additions & 0 deletions htroot/IndexControlRWIs_p.html
Expand Up @@ -152,6 +152,7 @@ <h2>Index Administration</h2>
<td>pos of phrase</td>
<td>pos in phrase</td>
<td>word distance</td>
<td>term frequency</td>
<td>authority</td>
<td>date</td>
<td>words in title</td>
Expand Down Expand Up @@ -181,6 +182,7 @@ <h2>Index Administration</h2>
<td class="TableCellDark">#[phrase]#</td>
<td class="TableCellDark">#[posinphrase]#</td>
<td class="TableCellDark">#[worddistance]#</td>
<td>#[tf]#</td>
<td>#[authority]#</td>
<td>#[date]#</td>
<td>#[wordsintitle]#</td>
Expand Down
3 changes: 2 additions & 1 deletion htroot/Ranking_p.java
Expand Up @@ -67,7 +67,8 @@ public class Ranking_p {
rankingParameters.put(plasmaSearchRankingProfile.POSOFPHRASE, "Position Of Phrase");
rankingParameters.put(plasmaSearchRankingProfile.POSINPHRASE, "Position In Phrase");
rankingParameters.put(plasmaSearchRankingProfile.PREFER, "Application Of Prefer Pattern");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist");
rankingParameters.put(plasmaSearchRankingProfile.TERMFREQUENCY, "Term Frequency");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPS, "URL Components");
rankingParameters.put(plasmaSearchRankingProfile.URLLENGTH, "URL Length");
rankingParameters.put(plasmaSearchRankingProfile.WORDDISTANCE, "Word Distance");
Expand Down
44 changes: 23 additions & 21 deletions source/de/anomic/index/indexRWIEntryOrder.java
Expand Up @@ -39,10 +39,10 @@
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.yacy.yacyURL;

public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> implements kelondroOrder<indexRWIEntry> {
public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry> implements kelondroOrder<indexRWIVarEntry> {
private indexRWIVarEntry min, max;
private plasmaSearchRankingProfile ranking;
private kelondroMScoreCluster<String> doms;
private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
private int maxdomcount;

private static final int processors = Runtime.getRuntime().availableProcessors(); // for multiprocessor support, used during normalization
Expand All @@ -55,7 +55,8 @@ public indexRWIEntryOrder(plasmaSearchRankingProfile profile) {
this.maxdomcount = 0;
}

public void extend(indexContainer container) {
public void normalizeWith(indexContainer container) {
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);

//long s0 = System.currentTimeMillis();
Expand Down Expand Up @@ -102,7 +103,7 @@ public void extend(indexContainer container) {
if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore();
}

public kelondroOrder<indexRWIEntry> clone() {
public kelondroOrder<indexRWIVarEntry> clone() {
return null;
}

Expand All @@ -111,14 +112,14 @@ public int authority(String urlHash) {
}

public long cardinal(byte[] key) {
return cardinal(new indexRWIRowEntry(key));
return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
}

public long tf(indexRWIEntry t) {
return (t.hitcount() - min.hitcount()) * (1 + max.wordsintext() - min.wordsintext()) / (1 + max.hitcount() - min.hitcount()) / (t.wordsintext() - min.wordsintext());
/**
 * Computes the term frequency of an index entry: its hit count divided by
 * the sum of its words-in-text and words-in-title counts plus one.
 * The added 1 keeps the divisor non-zero when both word counts are 0.
 * Both operands are cast to double before dividing, so the quotient is not truncated.
 *
 * @param t the entry whose hitcount(), wordsintext() and wordsintitle() are read
 * @return hitcount / (wordsintext + wordsintitle + 1) as a double
 */
public static final double termFrequency(indexRWIEntry t) {
return (((double) t.hitcount()) / ((double) (t.wordsintext() + t.wordsintitle() + 1)));
}

public long cardinal(indexRWIEntry t) {
public long cardinal(indexRWIVarEntry t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
kelondroBitfield flags = t.flags();
Expand All @@ -138,25 +139,26 @@ public long cardinal(indexRWIEntry t) {
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ (((int)((((t.termFrequency()- min.termFrequency() )*256.0)/ (1 + max.termFrequency()- min.termFrequency()))))<< ranking.coeff_termfrequency)
+ ( authority(t.urlHash()) << ranking.coeff_authority)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_appdescr : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_appdescr : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_appauthor : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_apptags : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_appref : 0))
+ (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0))
+ (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0))
+ (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0));
+ (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_apptags : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_appref : 0))
+ (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0))
+ (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0))
+ (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0));
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;

return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
}

public int compare(indexRWIEntry a, indexRWIEntry b) {
public int compare(indexRWIVarEntry a, indexRWIVarEntry b) {
long ca = cardinal(a);
long cb = cardinal(b);
return (ca > cb) ? 1 : (ca < cb) ? -1 : 0;
Expand All @@ -166,7 +168,7 @@ public String signature() {
return "rx";
}

public boolean wellformed(indexRWIEntry a) {
public boolean wellformed(indexRWIVarEntry a) {
return true;
}

Expand Down
13 changes: 12 additions & 1 deletion source/de/anomic/index/indexRWIVarEntry.java
Expand Up @@ -39,8 +39,9 @@ public class indexRWIVarEntry implements indexRWIEntry {
posinphrase, posofphrase,
quality, urlcomps, urllength, virtualAge,
worddistance, wordsintext, wordsintitle;
public double termFrequency;

public indexRWIVarEntry(indexRWIRowEntry e) {
public indexRWIVarEntry(indexRWIEntry e) {
this.flags = e.flags();
this.freshUntil = e.freshUntil();
this.lastModified = e.lastModified();
Expand All @@ -61,6 +62,7 @@ public indexRWIVarEntry(indexRWIRowEntry e) {
this.worddistance = e.worddistance();
this.wordsintext = e.wordsintext();
this.wordsintitle = e.wordsintitle();
this.termFrequency = ((double) e.hitcount()) / ((double) (e.wordsintext() + e.wordsintitle() + 1));
}

public void combineDistance(indexRWIEntry oe) {
Expand Down Expand Up @@ -166,6 +168,10 @@ public int wordsintitle() {
return wordsintitle;
}

/**
 * Returns the value currently held in this entry's termFrequency field.
 *
 * @return the stored term frequency
 */
public double termFrequency() {
return termFrequency;
}

public static final void min(indexRWIVarEntry t, indexRWIEntry other) {
int v;
long w;
Expand All @@ -185,6 +191,9 @@ public static final void min(indexRWIVarEntry t, indexRWIEntry other) {
if (t.urllength() > (v = other.urllength())) t.urllength = v;
if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v;
if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v;
double tf = (other instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) other).termFrequency : indexRWIEntryOrder.termFrequency(other);
if (t.termFrequency > tf) t.termFrequency = tf;

}

public static final void max(indexRWIVarEntry t, indexRWIEntry other) {
Expand All @@ -206,6 +215,8 @@ public static final void max(indexRWIVarEntry t, indexRWIEntry other) {
if (t.urllength() < (v = other.urllength())) t.urllength = v;
if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v;
if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v;
double tf = (other instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) other).termFrequency : indexRWIEntryOrder.termFrequency(other);
if (t.termFrequency < tf) t.termFrequency = tf;
}

}
2 changes: 2 additions & 0 deletions source/de/anomic/plasma/plasmaSearchAPI.java
Expand Up @@ -33,6 +33,7 @@

import de.anomic.data.listManager;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOrder;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
Expand Down Expand Up @@ -149,6 +150,7 @@ public static void genURLList(serverObjects prop, String keyhash, String keystri
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", indexRWIEntryOrder.termFrequency(entry.word()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.formatShortDay(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
Expand Down
110 changes: 55 additions & 55 deletions source/de/anomic/plasma/plasmaSearchRankingProcess.java
Expand Up @@ -40,6 +40,7 @@
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOrder;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexRWIVarEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
Expand Down Expand Up @@ -113,81 +114,80 @@ public void execQuery(boolean fetchURLs) {
}

if (sortorder == 2) {
insert(index, true);
insertRanked(index, true);
} else {
final Iterator<indexRWIRowEntry> en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)

indexRWIEntry ientry;
indexURLEntry uentry;
String u;
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
insertNoOrder(index, fetchURLs);
}
}

private void insertNoOrder(indexContainer index, boolean local) {
final Iterator<indexRWIRowEntry> en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)

indexRWIEntry ientry;
indexURLEntry uentry;
String u;
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();

// check constraints
if (!testFlags(ientry)) continue loop;

// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {flagcount[i]++;}
}

// load url
if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
filteredCount++;
} else {
if (fetchURLs) {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
this.misses.add(ientry.urlHash());
} else {
u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry);
this.urlhashes.put(ientry.urlHash(), u);
filteredCount++;
}
// check constraints
if (!testFlags(ientry)) continue loop;

// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {flagcount[i]++;}
}

// load url
if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
filteredCount++;
} else {
if (local) {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
this.misses.add(ientry.urlHash());
} else {
u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry);
this.urlhashes.put(ientry.urlHash(), u);
filteredCount++;
}
} else {
filteredCount++;
}
// interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
} // end loop
}
}

// interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
} // end loop
}

public void insert(indexContainer container, boolean local) {
public void insertRanked(indexContainer index, boolean local) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime

assert (container != null);
if (container.size() == 0) return;
assert (index != null);
if (index.size() == 0) return;

long timer = System.currentTimeMillis();
if (this.order == null) {
this.order = new indexRWIEntryOrder(query.ranking);
}
this.order.extend(container);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, container.size(), System.currentTimeMillis() - timer));

/*
container.setOrdering(o, 0);
container.sort();
*/
this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer));

// normalize entries and get ranking
timer = System.currentTimeMillis();
Iterator<indexRWIRowEntry> i = container.entries();
indexRWIEntry iEntry, l;
Iterator<indexRWIRowEntry> i = index.entries();
indexRWIVarEntry iEntry, l;
long biggestEntry = 0;
//long s0 = System.currentTimeMillis();
Long r;
while (i.hasNext()) {
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue;
iEntry = new indexRWIVarEntry(i.next());
if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

// increase flag counts
for (int j = 0; j < 32; j++) {
Expand Down Expand Up @@ -216,11 +216,11 @@ public void insert(indexContainer container, boolean local) {
continue;
} else {
if (urlhashes.containsKey(iEntry.urlHash())) continue;
l = (indexRWIEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
l = (indexRWIVarEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
urlhashes.remove(l.urlHash());
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
biggestEntry = order.cardinal((indexRWIEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
biggestEntry = order.cardinal((indexRWIVarEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
}
}

Expand All @@ -232,7 +232,7 @@ public void insert(indexContainer container, boolean local) {
//System.out.println("###DEBUG### time to sort " + container.size() + " entries to " + this.filteredCount + ": " + sc + " milliseconds, " + (container.size() / sc) + " entries/millisecond, ranking = " + tc);

//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, container.size(), System.currentTimeMillis() - timer));
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer));
}

private boolean testFlags(indexRWIEntry ientry) {
Expand Down

0 comments on commit 974fea7

Please sign in to comment.