redesigned NURL-handling:
- the general NURL index for all crawl stack types was split into separate indexes, one per stack
- the new NURL index is managed by the crawl balancer
- the crawl balancer no longer needs an internal index; it is replaced by the NURL index
- NURL.Entry was generalized into a new class, plasmaCrawlEntry
- the new class plasmaCrawlEntry also replaces the preNURL.Entry class and will replace the switchboardEntry class in the future
- plasmaCrawlEntry stores dates more accurately (with millisecond precision) and can hold larger 'name' entries (anchor tag names)
- the EURL object was replaced by a new ZURL object, which is a container for a plasmaCrawlEntry plus some tracking information
- the EURL index is now filled with ZURL objects
- a new index, delegatedURL, holds ZURL objects for plasmaCrawlEntry objects to track which URLs have been handed over to other peers
- redesigned the plasmaCrawlEntry handover, since there is no longer any need to convert one entry object into another
- found and fixed numerous bugs in crawl state handling
- fixed a serious bug in kelondroCache that prevented entries from being removed
- fixed some bugs in the online interface and adapted the monitor output to the new entry objects
- adapted the yacy protocol to handle the new delegatedURL entries
all old crawl queues will disappear after this update!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Mar 16, 2007
1 parent 094a148 commit 861f41e
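
To make the redesign concrete, here is a minimal sketch of how the new classes relate. Only the class names (plasmaCrawlEntry, plasmaCrawlZURL), the anycause() accessor, and the errorURL/delegatedURL indexes appear in this commit; every field and signature below is an illustrative assumption, not the actual YaCy source.

    // Illustrative sketch only: all members below are assumptions, not YaCy code.
    import java.util.Date;

    // Generalized crawl entry: replaces NURL.Entry and preNURL.Entry, and is
    // queued directly on the per-stack NURL indexes managed by the crawl balancer.
    class plasmaCrawlEntry {
        String urlhash;   // hash of the URL, used as the index key
        String url;       // the URL to be crawled
        String name;      // anchor tag name; may now be longer than before
        long   loaddate;  // load date with millisecond precision
        String initiator; // hash of the peer that initiated the crawl
    }

    // Container for a crawl entry plus tracking information: replaces EURL.
    // ZURL objects fill the errorURL index and the new delegatedURL index, so a
    // failed or delegated URL keeps its original crawl entry attached.
    class plasmaCrawlZURL {
        plasmaCrawlEntry entry;    // the wrapped crawl entry, no field copying
        String           executor; // peer that processed (or failed on) the URL
        Date             workdate; // when the URL was processed or handed over
        String           anycause; // failure reason, read via anycause()
    }

Because a ZURL wraps the crawl entry instead of copying its fields into a new object, handover between queues needs no conversion step any more; this is why the diffs below can replace the old seven-argument errorURL.newEntry(...) call with a two-argument newEntry(url, reason).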
Showing 31 changed files with 1,086 additions and 1,408 deletions.
3 changes: 2 additions & 1 deletion htroot/CrawlURLFetchStack_p.java
@@ -56,6 +56,7 @@
 import de.anomic.htmlFilter.htmlFilterWriter;
 import de.anomic.http.httpHeader;
 import de.anomic.net.URL;
+import de.anomic.plasma.plasmaCrawlEntry;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
@@ -272,7 +273,7 @@ private static boolean addURL(String url, boolean blCheck, URLFetcherStack stack
 }
 
 private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) {
-plasmaCrawlNURL.Entry entry;
+plasmaCrawlEntry entry;
 int failed = 0;
 for (int i=0; i<count; i++) try {
 entry = nurl.pop(fromStackType);
12 changes: 3 additions & 9 deletions htroot/CrawlURLFetch_p.java
@@ -49,10 +49,9 @@
 import java.util.Random;
 import java.util.TreeMap;
 
-import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.net.URL;
-import de.anomic.plasma.plasmaCrawlEURL;
 import de.anomic.plasma.plasmaCrawlProfile;
+import de.anomic.plasma.plasmaCrawlZURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverSwitch;
 import de.anomic.http.httpHeader;
@@ -499,14 +498,9 @@ private int stackURLs(String[] urls) throws InterruptedException {
 totalFailed++;
 this.failed.put(urls[i], reason);
 try {
-plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
+plasmaCrawlZURL.Entry ee = this.sb.errorURL.newEntry(
 new URL(urls[i]),
-null,
-yacyCore.seedDB.mySeed.hash,
-yacyCore.seedDB.mySeed.hash,
-"",
-reason,
-new kelondroBitfield());
+reason);
 ee.store();
 this.sb.errorURL.stackPushEntry(ee);
 } catch (MalformedURLException e) { }
6 changes: 3 additions & 3 deletions htroot/IndexCreateIndexingQueue_p.java
@@ -50,7 +50,7 @@
 import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
 import de.anomic.net.URL;
-import de.anomic.plasma.plasmaCrawlEURL;
+import de.anomic.plasma.plasmaCrawlZURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaSwitchboardQueue;
 import de.anomic.server.serverObjects;
@@ -186,7 +186,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 dark = true;
 URL url;
 String initiatorHash, executorHash;
-plasmaCrawlEURL.Entry entry;
+plasmaCrawlZURL.Entry entry;
 yacySeed initiatorSeed, executorSeed;
 int j=0;
 for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
@@ -202,7 +202,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName())));
 prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : wikiCode.replaceHTML(executorSeed.getName())));
 prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url.toString()));
-prop.put("rejected_list_"+j+"_failreason", entry.failreason());
+prop.put("rejected_list_"+j+"_failreason", entry.anycause());
 prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
 dark = !dark;
 j++;
5 changes: 3 additions & 2 deletions htroot/IndexCreateWWWGlobalQueue_p.java
@@ -49,6 +49,7 @@
 
 import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlEntry;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlProfile;
 import de.anomic.plasma.plasmaSwitchboard;
@@ -99,9 +100,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 prop.put("crawler-queue", 0);
 } else {
 prop.put("crawler-queue", 1);
-plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
+plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
 prop.put("crawler-queue_num", stackSize);//num Entries
-plasmaCrawlNURL.Entry urle;
+plasmaCrawlEntry urle;
 boolean dark = true;
 yacySeed initiator;
 String profileHandle;
19 changes: 7 additions & 12 deletions htroot/IndexCreateWWWLocalQueue_p.java
@@ -43,7 +43,6 @@
 // javac -classpath .:../classes IndexCreate_p.java
 // if the shell's current path is HTROOT
 
-import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Iterator;
@@ -54,10 +53,10 @@
 
 import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlEntry;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlProfile;
 import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaCrawlNURL.Entry;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 import de.anomic.yacy.yacyCore;
@@ -101,15 +100,11 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 
 // iterating through the list of URLs
 Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
+plasmaCrawlEntry entry;
 while (iter.hasNext()) {
+entry = (plasmaCrawlEntry) iter.next();
 String value = null;
-String nextHash = (String) iter.next();
-Entry entry = null;
-try {
-entry = switchboard.noticeURL.getEntry(nextHash);
-} catch (IOException e) {
-continue;
-}
+String nextHash = entry.urlhash();
 if ((option.equals("URL")&&(entry.url() != null))) {
 value = entry.url().toString();
 } else if ((option.equals("AnchorName"))) {
@@ -162,9 +157,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 prop.put("crawler-queue", 0);
 } else {
 prop.put("crawler-queue", 1);
-plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
+plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
 
-plasmaCrawlNURL.Entry urle;
+plasmaCrawlEntry urle;
 boolean dark = true;
 yacySeed initiator;
 String profileHandle;
@@ -183,7 +178,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
 prop.put("crawler-queue_list_"+showNum+"_anchor", wikiCode.replaceHTML(urle.name()));
 prop.put("crawler-queue_list_"+showNum+"_url", wikiCode.replaceHTML(urle.url().toString()));
-prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash());
+prop.put("crawler-queue_list_"+showNum+"_hash", urle.urlhash());
 dark = !dark;
 showNum++;
 } else {
20 changes: 10 additions & 10 deletions htroot/IndexCreate_p.html
@@ -27,11 +27,6 @@ <h2>Index Creation</h2>
 <td>Starting Point:</td>
 <td>
 <table cellpadding="0" cellspacing="0">
-<tr>
-<td>From&nbsp;File:</td>
-<td><input type="radio" name="crawlingMode" value="file" /></td>
-<td><input type="file" name="crawlingFile" size="28" /></td>
-</tr>
 <tr>
 <td>From&nbsp;URL:</td>
 <td><input type="radio" name="crawlingMode" value="url" checked="checked" /></td>
@@ -41,7 +36,12 @@
 </td>
 </tr>
 <tr>
-<td colspan="2"><span id="title"></span></td>
+<td>From&nbsp;File:</td>
+<td><input type="radio" name="crawlingMode" value="file" /></td>
+<td><input type="file" name="crawlingFile" size="28" /></td>
+</tr>
+<tr>
+<td colspan="3" class="commit"><span id="title"><br></span></td>
 </tr>
 </table>
 </td>
@@ -125,7 +125,7 @@
 </td>
 </tr>
 <tr valign="top" class="TableCellLight">
-<td>Store to Proxy Cache:</td>
+<td>Store to Web Cache:</td>
 <td><input type="checkbox" name="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
 <td>
 This option is used by default for proxy prefetch, but is not needed for explicit crawling.
@@ -194,9 +194,9 @@
 <tr valign="top" class="TableCellLight">
 <td>Wanted Performance:</td>
 <td>
-<input type="radio" name="crawlingSpeed" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp;
-<input type="radio" name="crawlingSpeed" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp;
-<input type="radio" name="crawlingSpeed" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process
+<input type="radio" name="crawlingPerformance" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp;
+<input type="radio" name="crawlingPerformance" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp;
+<input type="radio" name="crawlingPerformance" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process
 </td>
 <td colspan="3">
 Set wanted level of computing power, used for this and other running crawl tasks. (PPM = pages per minute)
1 change: 0 additions & 1 deletion htroot/Messages_p.java
@@ -53,7 +53,6 @@
 import java.util.TreeMap;
 
 import de.anomic.data.messageBoard;
-import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.plasma.plasmaSwitchboard;
14 changes: 6 additions & 8 deletions htroot/WatchCrawler_p.java
@@ -39,10 +39,9 @@
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterWriter;
 import de.anomic.http.httpHeader;
-import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.net.URL;
-import de.anomic.plasma.plasmaCrawlEURL;
 import de.anomic.plasma.plasmaCrawlProfile;
+import de.anomic.plasma.plasmaCrawlZURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.server.serverFileUtils;
@@ -222,8 +221,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 prop.put("info_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
 prop.put("info_reasonString", reasonString);
 
-plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
-crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
+plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, reasonString);
 ee.store();
 switchboard.errorURL.stackPushEntry(ee);
 }
@@ -300,8 +298,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 if (rejectReason == null) {
 c++;
 } else {
-plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
-(String) e.getValue(), rejectReason, new kelondroBitfield());
+plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, rejectReason);
 ee.store();
 switchboard.errorURL.stackPushEntry(ee);
 }
@@ -401,9 +398,10 @@ private static int recrawlIfOlderC(boolean recrawlIfOlderCheck, int recrawlIfOld
 
 private static void setPerformance(plasmaSwitchboard sb, serverObjects post) {
 String crawlingPerformance = post.get("crawlingPerformance","custom");
-int wantedPPM = 1000;
+long LCbusySleep = Integer.parseInt(sb.getConfig(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100"));
+int wantedPPM = (int) (60000L / LCbusySleep);
 try {
-wantedPPM = Integer.parseInt(post.get("customPPM","1000"));
+wantedPPM = Integer.parseInt(post.get("customPPM",Integer.toString(wantedPPM)));
 } catch (NumberFormatException e) {}
 if (crawlingPerformance.equals("minimum")) wantedPPM = 10;
 if (crawlingPerformance.equals("maximum")) wantedPPM = 1000;
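
A note on the setPerformance() change above: the default PPM is no longer hard-coded to 1000 but is derived from the configured busy-sleep pause. Since the local crawl job sleeps LCbusySleep milliseconds between pages, its current rate is 60000 / LCbusySleep pages per minute. A standalone sketch of that conversion follows (assumed helper class for illustration, not part of YaCy):

    // Standalone sketch of the PPM <-> busy-sleep conversion used in setPerformance().
    public class PpmConversion {
        // Pages per minute implied by a busy-sleep pause (milliseconds between pages).
        static int ppmFromBusySleep(long busySleepMs) {
            return (int) (60000L / busySleepMs);
        }

        // Busy-sleep pause required to reach a wanted pages-per-minute rate.
        static long busySleepFromPpm(int wantedPPM) {
            return 60000L / wantedPPM;
        }

        public static void main(String[] args) {
            System.out.println(ppmFromBusySleep(100)); // default "100" ms -> 600 PPM
            System.out.println(busySleepFromPpm(600)); // 600 PPM -> back to 100 ms
        }
    }

With the default busy-sleep of 100 ms this yields 600 PPM, so a form submitted without a custom value now keeps the current crawl speed instead of falling back to the old fixed 1000 PPM.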
7 changes: 4 additions & 3 deletions htroot/xml/queues_p.java
@@ -54,6 +54,7 @@
 
 import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlEntry;
 import de.anomic.plasma.plasmaCrawlLoaderMessage;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaSwitchboard;
@@ -183,10 +184,10 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 }
 
 
-public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlNURL.Entry[] crawlerList) {
+public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlEntry[] crawlerList) {
 
 int showNum = 0;
-plasmaCrawlNURL.Entry urle;
+plasmaCrawlEntry urle;
 yacySeed initiator;
 for (int i = 0; i < crawlerList.length; i++) {
 urle = crawlerList[i];
@@ -198,7 +199,7 @@ public static final void addNTable(serverObjects prop, String tableName, plasmaC
 prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate()));
 prop.putSafeXML(tableName + "_" + showNum + "_anchor", urle.name());
 prop.putSafeXML(tableName + "_" + showNum + "_url", urle.url().toString());
-prop.put(tableName + "_" + showNum + "_hash", urle.hash());
+prop.put(tableName + "_" + showNum + "_hash", urle.urlhash());
 showNum++;
 }
 }
2 changes: 1 addition & 1 deletion htroot/xml/snippet.java
@@ -66,7 +66,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
 prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
 } else {
 // problems with snippet fetch
-prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError());
+prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
 prop.put("link", 0);
 prop.put("links", 0);
