Skip to content

Commit

Permalink
several bugfixes
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1971 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Mar 27, 2006
1 parent 57fc0cf commit 7a650d0
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 13 deletions.
2 changes: 1 addition & 1 deletion htroot/IndexCreate_p.java
Expand Up @@ -392,7 +392,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true));
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, 160));
prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));
Expand Down
3 changes: 2 additions & 1 deletion source/de/anomic/plasma/plasmaCrawlLURL.java
Expand Up @@ -416,10 +416,11 @@ public class Entry {
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - ETag: for re-crawl decision upon HEAD request

public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database
Expand Down
24 changes: 20 additions & 4 deletions source/de/anomic/plasma/plasmaCrawlProfile.java
Expand Up @@ -414,12 +414,24 @@ public void domInc(String domain, String referrer, int depth) {
}
domsCache.put(this.mem.get("handle"), doms);
}
public int domCount(String domain) {
public boolean grantedDomAppearance(String domain) {
int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0;
return 0 < max;
} else {
return dp.count < max;
}
}
public boolean grantedDomCount(String domain) {
int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0 < max;
} else {
return dp.count;
return dp.count < max;
}
}
public int domSize() {
Expand All @@ -429,7 +441,7 @@ public boolean domExists(String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domNames(boolean attr) {
public String domNames(boolean attr, int maxlength) {
Iterator domnamesi = doms.entrySet().iterator();
String domnames="";
Map.Entry ey;
Expand All @@ -438,6 +450,10 @@ public String domNames(boolean attr) {
ey = (Map.Entry) domnamesi.next();
dp = (DomProfile) ey.getValue();
domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
if ((maxlength > 0) && (domnames.length() >= maxlength)) {
domnames = domnames.substring(0, maxlength-3) + "...";
break;
}
}
return domnames;
}
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/plasmaCrawlStacker.java
Expand Up @@ -323,15 +323,15 @@ public String stackCrawl(String nexturlString, String referrerString, String ini
}

// deny urls that do not match with the profile domain list
if (profile.domCount(nexturl.getHost()) == 0) {
if (!(profile.grantedDomAppearance(nexturl.getHost()))) {
reason = "denied_(no_match_with_domain_filter)";
this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}

// deny urls that exceed allowed number of occurrences
if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) {
if (!(profile.grantedDomCount(nexturl.getHost()))) {
reason = "denied_(domain_count_exceeded)";
this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime));
Expand Down
12 changes: 7 additions & 5 deletions source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
Expand Up @@ -172,11 +172,13 @@ public plasmaWordIndexEntryContainer storeTry(String wordHash, plasmaWordIndexEn
int need = newContainer.size();
int selectedAssortment = testsize - 1;
while (selectedAssortment >= 0) {
spaces[selectedAssortment] = (assortments[selectedAssortment].get(wordHash) == null) ? (selectedAssortment + 1) : 0;
need -= spaces[selectedAssortment];
assert (need >= 0);
if (need == 0) break;
selectedAssortment = (need < selectedAssortment) ? need : selectedAssortment - 1;
if (selectedAssortment + 1 <= need) {
spaces[selectedAssortment] = (assortments[selectedAssortment].get(wordHash) == null) ? (selectedAssortment + 1) : 0;
need -= spaces[selectedAssortment];
assert (need >= 0);
if (need == 0) break;
}
selectedAssortment--;
}
if (need == 0) {
// we found spaces so that we can put in the newContainer into these spaces
Expand Down

0 comments on commit 7a650d0

Please sign in to comment.