Skip to content

Commit

Permalink
enhanced search result computation
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2527 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Sep 8, 2006
1 parent 809960d commit 03835c2
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 18 deletions.
2 changes: 2 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -154,6 +154,8 @@ public static String urlNormalform(URL baseURL, String us) {
*/
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
/**
 * Splits a URL string into its lower-cased word components.
 *
 * <p>If the string contains "//" at an index greater than zero, everything up to
 * and including that separator is dropped first, so a leading scheme such as
 * "http:" does not become a component. The remainder is lower-cased and split
 * with the regular expression {@code splitrex}.
 *
 * @param normalizedURL the URL string to decompose; must not be {@code null}
 * @return the parts obtained by splitting the lower-cased remainder on {@code splitrex}
 */
public static String[] urlComps(String normalizedURL) {
    int p = normalizedURL.indexOf("//");
    if (p > 0) {
        normalizedURL = normalizedURL.substring(p + 2);
    }
    // Lower-case with a fixed locale: the no-argument toLowerCase() uses the JVM
    // default locale, which maps 'I' to a dotless 'ı' under e.g. Turkish settings
    // and would make the resulting components differ between installations.
    return normalizedURL.toLowerCase(java.util.Locale.ENGLISH).split(splitrex); // word components of the url
}

Expand Down
13 changes: 5 additions & 8 deletions source/de/anomic/plasma/plasmaSearchEvent.java
Expand Up @@ -266,8 +266,8 @@ public plasmaSearchResult order(indexContainer rcLocal) {

// apply filter
profileLocal.startTimer();
//acc.removeRedundant();
acc.removeDoubleDom();
acc.removeRedundant();
//acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());

Expand All @@ -294,12 +294,9 @@ private plasmaSearchResult orderLocal(indexContainer rcLocal, long maxtime) {
plasmaCrawlLURL.Entry page;
Long preranking;
Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {
while (preorder.hasNext()) {
//if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
//if (acc.sizeFetched() >= minEntries) break;
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
if (System.currentTimeMillis() >= postorderLimitTime) break;
preorderEntry = preorder.next();
entry = (indexEntry) preorderEntry[0];
preranking = (Long) preorderEntry[1];
Expand All @@ -322,8 +319,8 @@ private plasmaSearchResult orderLocal(indexContainer rcLocal, long maxtime) {

// apply filter
profileLocal.startTimer();
//acc.removeRedundant();
acc.removeDoubleDom();
acc.removeRedundant();
//acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());

Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/plasmaSearchPreOrder.java
Expand Up @@ -96,7 +96,7 @@ public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile
this.pageAcc = new TreeMap();
for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
}
}

Expand Down Expand Up @@ -150,7 +150,7 @@ public boolean hasNext() {
}

public Object[] /*{indexEntry, Long}*/ next() {
String top = (String) pageAcc.lastKey();
String top = (String) pageAcc.firstKey();
//System.out.println("preorder-key: " + top);
Long preranking = new Long(Long.parseLong(top.substring(0, 16), 16));
return new Object[]{(indexEntry) pageAcc.remove(top), preranking};
Expand Down
14 changes: 8 additions & 6 deletions source/de/anomic/plasma/plasmaSearchRankingProfile.java
Expand Up @@ -170,16 +170,18 @@ public long preRanking(indexEntry normalizedEntry, String searchedWord) {
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (256 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
/*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
else
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking);
*/
return ranking;
}

Expand Down Expand Up @@ -219,11 +221,11 @@ public long postRanking(

// prefer short urls
ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (32 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue();
ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();

// prefer long descriptions
ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();

return ranking;
}
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/plasmaSearchResult.java
Expand Up @@ -100,7 +100,7 @@ public boolean hasMoreElements() {
}

// Takes one key from pageAcc, removes the entry stored under it and returns that
// entry cast to plasmaCrawlLURL.Entry.
public plasmaCrawlLURL.Entry nextElement() {
// NOTE(review): the two declarations of `top` below appear to be the removed
// (lastKey) and added (firstKey) versions of the same line in this diff view;
// presumably only the firstKey() line exists in the committed file — confirm.
Object top = pageAcc.lastKey();
Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top));
// remove(top) takes the entry out of pageAcc, so each call consumes one element.
return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
}
Expand Down Expand Up @@ -154,7 +154,7 @@ protected void sortResults(boolean postsort) {

// insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + page.hash(), page);
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - ranking, 16) + page.hash(), page);
}

// flush memory
Expand Down

0 comments on commit 03835c2

Please sign in to comment.