enhancement to crawling and remote crawling:
- for the redirector and remote crawling, place the crawl URL on the notice queue instead of enqueueing it directly in the crawler queue
- when a request to a remote crawl provider fails, remove the peer from the network so that the URL fetcher does not get stuck on it again

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5320 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Nov 6, 2008
1 parent 3f746be commit 1b18d4b
Showing 4 changed files with 37 additions and 44 deletions.
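
Before the file-by-file diff, a minimal sketch of the pattern the first commit bullet describes: producers hand URLs to a notice queue through an enqueue call and a worker thread performs the actual stacking, instead of every caller running the stack operation synchronously. All names here (NoticeEntry, NoticeQueue, the simplified stackCrawl signature) are illustrative stand-ins, not YaCy's actual API.

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    // Illustrative stand-in for a crawl request; YaCy's real CrawlEntry carries
    // initiator hash, referrer hash, load date, depth, profile handle, etc.
    record NoticeEntry(String url, String initiatorHash, String name) {}

    final class NoticeQueue implements Runnable {
        private final BlockingQueue<NoticeEntry> queue = new LinkedBlockingQueue<>();

        // producers (redirector, remote-crawl RSS parser) call this and return
        // immediately; they no longer block on the stacking work itself
        public void enqueueEntry(NoticeEntry e) {
            queue.offer(e);
        }

        // a single worker drains the queue and performs the checks that the
        // callers previously ran inline via stackCrawl(...)
        @Override
        public void run() {
            try {
                while (!Thread.currentThread().isInterrupted()) {
                    NoticeEntry e = queue.take();
                    String reason = stackCrawl(e); // null means accepted
                    if (reason != null) {
                        System.out.println("rejected " + e.url() + ": " + reason);
                    }
                }
            } catch (InterruptedException ex) {
                Thread.currentThread().interrupt();
            }
        }

        private String stackCrawl(NoticeEntry e) {
            // placeholder for duplicate, blacklist and robots checks
            return null;
        }
    }
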
htroot/rct_p.java (1 addition, 11 deletions)
@@ -76,17 +76,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
 if (urlRejectReason == null) {
     // stack url
     if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-    final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-    if (reasonString == null) {
-        // done
-        env.getLog().logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-    } else if (reasonString.startsWith("double")) {
-        // case where we have already the url loaded;
-        env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-    } else {
-        env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-    }
+    sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
 } else {
     env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
 }
source/de/anomic/crawler/CrawlQueues.java (8 additions, 12 deletions)
@@ -362,7 +362,13 @@ public boolean remoteCrawlLoaderJob() {

 // we know a peer which should provide remote crawl entries. load them now.
 final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.webIndex.seedDB, seed, 30, 5000);
-if (feed == null) return true;
+if (feed == null || feed.size() == 0) {
+    // something is wrong with this provider; to avoid getting stuck on
+    // this peer again, remove it from the peer list
+    sb.webIndex.peerActions.peerDeparture(seed, "no results from provided remote crawls");
+    return true;
+}
+
 // parse the rss
 yacyURL url, referrer;
 Date loaddate;
@@ -389,17 +395,7 @@ public boolean remoteCrawlLoaderJob() {
 if (urlRejectReason == null) {
     // stack url
     if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-    final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-    if (reasonString == null) {
-        // done
-        log.logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-    } else if (reasonString.startsWith("double")) {
-        // case where we have already the url loaded;
-        log.logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-    } else {
-        log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-    }
+    sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
 } else {
     log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
 }
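
The first hunk above implements the second commit bullet. In isolation, the idea looks roughly like the following sketch; the provider set, queryFeed and enqueue are hypothetical simplifications of YaCy's seed database, yacyClient.queryRemoteCrawlURLs and the notice-queue handoff.

    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Hypothetical sketch: drop a remote crawl provider that returns nothing,
    // so the loader does not repeatedly select the same dead peer.
    final class RemoteCrawlLoader {
        private final Set<String> providers = new HashSet<>();

        public boolean loadFrom(String peer) {
            List<String> feed = queryFeed(peer); // may be null or empty on failure
            if (feed == null || feed.isEmpty()) {
                // remove the peer so the next scheduling round cannot pick it
                // again and get stuck on the same unresponsive provider
                providers.remove(peer);
                return true;
            }
            for (String url : feed) {
                enqueue(url); // hand each URL to the notice queue
            }
            return true;
        }

        private List<String> queryFeed(String peer) { return List.of(); } // stub
        private void enqueue(String url) { /* stub */ }
    }
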
source/de/anomic/crawler/CrawlStacker.java (27 additions, 20 deletions)
@@ -210,6 +210,33 @@ public boolean job() {
     return true;
 }
 
+public String stackCrawl(
+        final yacyURL url,
+        final String referrerhash,
+        final String initiatorHash,
+        final String name,
+        final Date loadDate,
+        final int currentdepth,
+        final CrawlProfile.entry profile) {
+    // stacks a crawl item. The position can also be remote
+    // returns null if successful, a reason string if not successful
+    //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
+
+    // add the url into the crawling queue
+    final CrawlEntry entry = new CrawlEntry(
+            initiatorHash,                               // initiator, needed for p2p-feedback
+            url,                                         // url clear text string
+            (referrerhash == null) ? "" : referrerhash,  // last url in crawling queue
+            name,                                        // the anchor name
+            loadDate,                                    // load date
+            (profile == null) ? null : profile.handle(), // profile must not be null!
+            currentdepth,                                // depth so far
+            0,                                           // anchors, default value
+            0                                            // forkfactor, default value
+    );
+    return stackCrawl(entry);
+}
+
 public void enqueueEntry(
         final yacyURL nexturl,
         final String referrerhash,
@@ -342,26 +369,6 @@ public CrawlEntry dequeueEntry() throws IOException {
     return new CrawlEntry(entry);
 }
 
-public String stackCrawl(final yacyURL url, final yacyURL referrer, final String initiatorHash, final String name, final Date loadDate, final int currentdepth, final CrawlProfile.entry profile) {
-    // stacks a crawl item. The position can also be remote
-    // returns null if successful, a reason string if not successful
-    //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-    // add the url into the crawling queue
-    final CrawlEntry entry = new CrawlEntry(
-            initiatorHash,                               // initiator, needed for p2p-feedback
-            url,                                         // url clear text string
-            (referrer == null) ? "" : referrer.hash(),   // last url in crawling queue
-            name,                                        // load date
-            loadDate,                                    // the anchor name
-            (profile == null) ? null : profile.handle(), // profile must not be null!
-            currentdepth,                                // depth so far
-            0,                                           // anchors, default value
-            0                                            // forkfactor, default value
-    );
-    return stackCrawl(entry);
-}
-
 public String stackCrawl(final CrawlEntry entry) {
     // stacks a crawl item. The position can also be remote
     // returns null if successful, a reason string if not successful
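
Two details of the relocation above are easy to miss: the referrer parameter changes from a yacyURL to its hash string, matching enqueueEntry's String referrerhash parameter (which is why callers now pass referrer.hash()), and the overload stays a thin wrapper that only assembles a CrawlEntry and delegates to stackCrawl(CrawlEntry). A trimmed-down, hypothetical illustration of that shape, with a stand-in Entry type rather than YaCy's actual class:

    // Hypothetical stand-in for YaCy's CrawlEntry; only a few fields are shown.
    record Entry(String initiatorHash, String url, String referrerHash, int depth) {}

    final class Stacker {
        // convenience overload: normalize the referrer hash, build the entry,
        // then delegate to the single real implementation
        public String stackCrawl(String url, String referrerHash, String initiatorHash, int depth) {
            return stackCrawl(new Entry(
                    initiatorHash,
                    url,
                    (referrerHash == null) ? "" : referrerHash,
                    depth));
        }

        public String stackCrawl(Entry entry) {
            // duplicate, blacklist and robots checks would run here;
            // return null on success, otherwise a reason string
            return null;
        }
    }
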
source/de/anomic/urlRedirector/urlRedirectord.java (1 addition, 1 deletion)
@@ -195,7 +195,7 @@ public Boolean REDIRECTOR(final String requestLine) {
 sb.crawlQueues.errorURL.remove(urlhash);
 
 // enqueuing URL for crawling
-reasonString = sb.crawlStacker.stackCrawl(
+sb.crawlStacker.enqueueEntry(
     reqURL,
     null,
     sb.webIndex.seedDB.mySeed().hash,
