enhanced input options for crawl start
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1978 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Mar 28, 2006
1 parent d181d3f commit 860a7b5
Showing 2 changed files with 50 additions and 25 deletions.
23 changes: 16 additions & 7 deletions htroot/IndexCreate_p.html
@@ -46,8 +46,8 @@ <h2>Index Creation</h2>
<tr valign="top" class="TableCellLight">
<td class=small>Re-Crawl Option:</td>
<td class=small>
<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>
<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"><br>
Use:<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>&nbsp;&nbsp;
Time:<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"><br>
<input type="radio" name="crawlingIfOlderUnit" value="year" #(crawlingIfOlderUnitYearCheck)#::checked#(/crawlingIfOlderUnitYearCheck)#>Year(s)&nbsp;&nbsp;
<input type="radio" name="crawlingIfOlderUnit" value="month" #(crawlingIfOlderUnitMonthCheck)#::checked#(/crawlingIfOlderUnitMonthCheck)#>Month(s)&nbsp;&nbsp;
<input type="radio" name="crawlingIfOlderUnit" value="day" #(crawlingIfOlderUnitDayCheck)#::checked#(/crawlingIfOlderUnitDayCheck)#>Day(s)&nbsp;&nbsp;
@@ -60,17 +60,26 @@ <h2>Index Creation</h2>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Auto-Dom-Filter Depth:</td>
<td class=small><input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>Auto-Dom-Filter:</td>
<td class=small>

Use:<input type="checkbox" name="crawlingDomFilterCheck" align="top" #(crawlingDomFilterCheck)#::checked#(/crawlingDomFilterCheck)#>&nbsp;&nbsp;
Depth:<input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>
This option causes the creation of a domain-list during indexing. The list is filled only with domains that
appear at the given depth during crawling. The domain-list is then used to filter out all domains that appear
at depths greater than the given depth but do not appear in the domain-list. You can use this option, for example,
to crawl a page of bookmarks while restricting the crawl to only those domains that appear on the bookmark page.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small>Maximum Pages per Domain:</td>
<td class=small><input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
<td class=small>

Use:<input type="checkbox" name="crawlingDomMaxCheck" align="top" #(crawlingDomMaxCheck)#::checked#(/crawlingDomMaxCheck)#>&nbsp;&nbsp;
Page-Count:<input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
<td class=small>
This option limits the maximum number of pages that are fetched and indexed from a single domain.
You can combine this limit with the 'Auto-Dom-Filter', so that the limit is applied to all domains within
the given depth. Domains outside the given depth are filtered out anyway.
</td>
</tr>
<tr valign="top" class="TableCellDark">
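Taken together, the two options above amount to an accept/reject test per URL. The following is a minimal sketch of that behaviour under the semantics described in the table cells, not YaCy's actual crawler code; the class and all member names are hypothetical.

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    // Hypothetical sketch of the Auto-Dom-Filter plus the per-domain
    // page limit described above; not the YaCy implementation.
    class DomainCrawlFilter {
        private final int filterDepth;        // Auto-Dom-Filter depth, -1 = disabled
        private final int maxPagesPerDomain;  // per-domain page limit, -1 = unlimited
        private final Set<String> allowedDomains = new HashSet<String>();
        private final Map<String, Integer> pageCount = new HashMap<String, Integer>();

        DomainCrawlFilter(int filterDepth, int maxPagesPerDomain) {
            this.filterDepth = filterDepth;
            this.maxPagesPerDomain = maxPagesPerDomain;
        }

        // Called once per URL the crawler is about to fetch.
        boolean accept(String host, int depth) {
            if (filterDepth >= 0) {
                if (depth <= filterDepth) {
                    allowedDomains.add(host);     // collect domains up to the given depth
                } else if (!allowedDomains.contains(host)) {
                    return false;                 // deeper URL on an uncollected domain
                }
            }
            if (maxPagesPerDomain >= 0) {
                Integer seen = pageCount.get(host);
                int n = (seen == null) ? 0 : seen.intValue();
                if (n >= maxPagesPerDomain) return false;  // domain quota exhausted
                pageCount.put(host, Integer.valueOf(n + 1));
            }
            return true;
        }
    }

With filterDepth = 1, a crawl started on a bookmark page would collect the bookmarked domains at depth 1 and reject every deeper link that leaves them, which matches the bookmark example in the description above.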
52 changes: 34 additions & 18 deletions htroot/IndexCreate_p.java
@@ -93,30 +93,43 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
// set new properties
String newcrawlingfilter = post.get("crawlingFilter", ".*");
env.setConfig("crawlingFilter", newcrawlingfilter);

int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "").equals("on");

boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
int domFilterDepth = Integer.parseInt(post.get("crawlingDomFilterDepth", "-1"));
env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
int domMaxPages = Integer.parseInt(post.get("crawlingDomMaxPages", "-1"));
env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
boolean crawlingQ = post.get("crawlingQ", "").equals("on");

boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));

boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));

boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
boolean storeHTCache = post.get("storeHTCache", "").equals("on");

boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
boolean localIndexing = post.get("localIndexing", "").equals("on");

boolean localIndexing = post.get("localIndexing", "off").equals("on");
env.setConfig("localIndexing", (localIndexing) ? "true" : "false");
boolean crawlOrder = post.get("crawlOrder", "").equals("on");

boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = post.get("xsstopw", "").equals("on");

boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = post.get("xdstopw", "").equals("on");

boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = post.get("xpstopw", "").equals("on");

boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

String crawlingMode = post.get("crawlingMode","url");
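The helper recrawlIfOlderC called above is not part of this diff. Judging from the display code near the end of the file, it apparently folds the (checkbox, number, unit) triple into a single minute count, with -1 meaning the re-crawl check is disabled. A plausible sketch, covering only the units visible in this diff:

    // Plausible sketch of recrawlIfOlderC; the real helper lives outside
    // this diff. Normalizes the form input into one minute count,
    // with -1 meaning "re-crawl check disabled".
    private static int recrawlIfOlderC(boolean checked, int number, String unit) {
        if (!checked || number < 0) return -1;
        if (unit.equals("year"))  return number * 60 * 24 * 365;
        if (unit.equals("month")) return number * 60 * 24 * 30;
        if (unit.equals("day"))   return number * 60 * 24;
        return number; // "minute" and anything unrecognized
    }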
@@ -154,7 +167,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
switchboard.urlPool.errorURL.remove(urlhash);

// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);

if (reasonString == null) {
@@ -215,7 +228,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
HashMap hyperlinks = (HashMap) scraper.getAnchors();

// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);

// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -306,7 +319,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));

int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == Integer.MAX_VALUE) ? 0 : 1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == -1) ? 0 : 1);
prop.put("crawlingIfOlderUnitYearCheck", 0);
prop.put("crawlingIfOlderUnitMonthCheck", 0);
prop.put("crawlingIfOlderUnitDayCheck", 0);
@@ -329,9 +342,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", 1);
}
//prop.put("crawlingIfOlder", crawlingIfOlder);
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? 0 : 1);
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? 0 : 1);
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
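The prop.put branches above run the opposite direction for display: the stored minute count is decomposed into the largest unit that divides it evenly, so the form echoes the number/unit pair the user originally entered. A hedged sketch of that decomposition (the helper name and exact branch order are illustrative, since part of this hunk is elided):

    // Illustrative reverse mapping for the display code above: pick the
    // largest visible unit that divides the stored minute count evenly.
    private static void putIfOlderUnit(serverObjects prop, int minutes) {
        final int day = 60 * 24;
        if (minutes % (day * 365) == 0) {
            prop.put("crawlingIfOlderNumber", minutes / (day * 365));
            prop.put("crawlingIfOlderUnitYearCheck", 1);
        } else if (minutes % (day * 30) == 0) {
            prop.put("crawlingIfOlderNumber", minutes / (day * 30));
            prop.put("crawlingIfOlderUnitMonthCheck", 1);
        } else if (minutes % day == 0) {
            prop.put("crawlingIfOlderNumber", minutes / day);
            prop.put("crawlingIfOlderUnitDayCheck", 1);
        } else {
            prop.put("crawlingIfOlderNumber", minutes);
            prop.put("crawlingIfOlderUnitMinuteCheck", 1);
        }
    }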
