enhanced re-crawl settings
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1960 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Mar 24, 2006
1 parent 708cc6c commit 0c9b618
Showing 4 changed files with 63 additions and 17 deletions.
13 changes: 11 additions & 2 deletions htroot/IndexCreate_p.html
@@ -45,9 +45,18 @@ <h2>Index Creation</h2>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small>Re-Crawl Option:</td>
<td class=small><input name="crawlingIfOlder" type="text" size="7" maxlength="7" value="#[crawlingIfOlder]#"></td>
<td class=small>

<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>
<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"></td>
Year(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="year" #(crawlingIfOlderUnitYearCheck)#::checked#(/crawlingIfOlderYearUnitCheck)#>
Month(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="month" #(crawlingIfOlderUnitMonthCheck)#::checked#(/crawlingIfOlderMonthUnitCheck)#>
Day(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="day" #(crawlingIfOlderUnitDayCheck)#::checked#(/crawlingIfOlderDayUnitCheck)#>
Hour(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="hour" #(crawlingIfOlderUnitHourCheck)#::checked#(/crawlingIfOlderHourUnitCheck)#>
Minute(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="minute" #(crawlingIfOlderUnitMinuteCheck)#::checked#(/crawlingIfOlderMinuteUnitCheck)#>
<td class=small>
If you use this option, web pages that already exist in your database are crawled and indexed again.
Whether this happens depends on the age of the last crawl: if the last crawl is older than the given
time limit, the page is crawled again; otherwise it is treated as a 'double' and not loaded or indexed again.
</td>
</tr>
<tr valign="top" class="TableCellDark">
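The help text in the hunk above describes the re-crawl rule: a page that is already in the index is fetched again only if its last crawl is older than the configured interval. A minimal sketch of that decision, assuming the interval is kept in minutes and a per-URL last-crawl date is available (the names here are illustrative, not the actual YaCy API):

import java.util.Date;

public class RecrawlSketch {
    // returns true if an already-indexed page should be fetched and indexed again
    static boolean shouldRecrawl(Date lastCrawlDate, int crawlingIfOlderMinutes) {
        if (crawlingIfOlderMinutes < 0) return false; // option disabled: treat as 'double'
        long ageMinutes = (System.currentTimeMillis() - lastCrawlDate.getTime()) / (60L * 1000L);
        return ageMinutes > crawlingIfOlderMinutes;   // older than the limit: crawl again
    }
}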
49 changes: 43 additions & 6 deletions htroot/IndexCreate_p.java
@@ -95,8 +95,11 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
env.setConfig("crawlingFilter", newcrawlingfilter);
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
int recrawlIfOlder = Integer.parseInt(post.get("crawlingIfOlder", "-1"));
env.setConfig("crawlingIfOlder", recrawlIfOlder);
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
int domFilterDepth = Integer.parseInt(post.get("crawlingDomFilterDepth", "-1"));
env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
int domMaxPages = Integer.parseInt(post.get("crawlingDomMaxPages", "-1"));
@@ -151,7 +154,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
switchboard.urlPool.errorURL.remove(urlhash);

// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);

if (reasonString == null) {
@@ -212,7 +215,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
HashMap hyperlinks = (HashMap) scraper.getAnchors();

// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);

// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -301,7 +304,32 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
prop.put("crawlingIfOlder", env.getConfig("crawlingIfOlder", "-1"));

int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == Integer.MAX_VALUE) ? 0 : 1);
prop.put("crawlingIfOlderUnitYearCheck", 0);
prop.put("crawlingIfOlderUnitMonthCheck", 0);
prop.put("crawlingIfOlderUnitDayCheck", 0);
prop.put("crawlingIfOlderUnitHourCheck", 0);
prop.put("crawlingIfOlderUnitMinuteCheck", 0);
// the stored re-crawl limit is in minutes; choose the largest display unit that fits
if (crawlingIfOlder == Integer.MAX_VALUE) {
// no re-crawl limit configured
} else if (crawlingIfOlder >= 60*24*365) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / (60*24*365));
prop.put("crawlingIfOlderUnitYearCheck", 1);
} else if (crawlingIfOlder >= 60*24*30) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / (60*24*30));
prop.put("crawlingIfOlderUnitMonthCheck", 1);
} else if (crawlingIfOlder >= 60*24) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / (60*24));
prop.put("crawlingIfOlderUnitDayCheck", 1);
} else if (crawlingIfOlder >= 60) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / 60);
prop.put("crawlingIfOlderUnitHourCheck", 1);
} else {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", 1);
}
//prop.put("crawlingIfOlder", crawlingIfOlder);
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
@@ -476,7 +504,16 @@ record = yacyCore.newsPool.get(yacyNewsPool.PROCESSED_DB, c);
// return rewrite properties
return prop;
}


// converts the re-crawl option from the form (number + unit) into minutes; returns -1 if the option is disabled
private static int recrawlIfOlderC(boolean recrawlIfOlderCheck, int recrawlIfOlderNumber, String crawlingIfOlderUnit) {
if (!recrawlIfOlderCheck) return -1;
if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 365;
if (crawlingIfOlderUnit.equals("month")) return recrawlIfOlderNumber * 60 * 24 * 30;
if (crawlingIfOlderUnit.equals("day")) return recrawlIfOlderNumber * 60 * 24;
if (crawlingIfOlderUnit.equals("hour")) return recrawlIfOlderNumber * 60;
if (crawlingIfOlderUnit.equals("minute")) return recrawlIfOlderNumber;
return -1;
}
}


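For orientation (a sketch, not part of the commit), the two halves of the new code round-trip through minutes: recrawlIfOlderC turns the form input into minutes, and the block above picks the largest unit that fits when the page is rendered again. A small worked example, assuming the 365-day year and the parenthesized divisions shown above:

public class RecrawlUnits {
    public static void main(String[] args) {
        // form input: 2, unit "day" -> recrawlIfOlderC(true, 2, "day") = 2880 minutes
        int minutes = 2 * 60 * 24;
        // rendering: 2880 >= 60*24 and < 60*24*30, so the Day(s) radio is checked
        int shown = minutes / (60 * 24); // -> 2, written to the crawlingIfOlderNumber field
        System.out.println(minutes + " minutes shown as " + shown + " day(s)");
    }
}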
12 changes: 6 additions & 6 deletions htroot/env/templates/header.template
@@ -37,13 +37,7 @@
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/IndexCreate_p.html" class="MenuItemLink">Index Create</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/IndexControl_p.html" class="MenuItemLink">Index Control</a></td></tr>
<tr><td class="MenuItem">&nbsp;<a href="/IndexMonitor.html" class="MenuItemLink">Index Monitor</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>

<tr><td class="MenuHeader">&nbsp;Local Proxy</td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/Blacklist_p.html" class="MenuItemLink">Blacklist</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink">Proxy Indexing</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CacheAdmin_p.html" class="MenuItemLink">Cache Monitor</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CookieMonitorIncoming_p.html" class="MenuItemLink">Cookie Monitor</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>

<tr><td class="MenuHeader">&nbsp;Communication / Publication</td></tr>
@@ -65,6 +59,12 @@
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/Connections_p.html" class="MenuItemLink">Connections</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>

<tr><td class="MenuHeader">&nbsp;Local Proxy</td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink">Proxy Indexing</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CacheAdmin_p.html" class="MenuItemLink">Cache Monitor</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CookieMonitorIncoming_p.html" class="MenuItemLink">Cookie Monitor</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>

<tr><td class="MenuHeader">&nbsp;The Project</td></tr>
<tr><td class="MenuItem">&nbsp;<a href="http://www.yacy.net/yacy/" class="MenuItemLink">Project Home</a></td></tr>
<tr><td class="MenuItem">&nbsp;<a href="http://www.yacy.net/yacy/News.html" class="MenuItemLink">Project News</a></td></tr>
6 changes: 3 additions & 3 deletions source/yacy.java
@@ -138,7 +138,7 @@ public final class yacy {
private static float version = (float) 0.1;

private static final String vDATE = "@REPL_DATE@";
private static final String copyright = "[ YACY Proxy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
private static final String copyright = "[ YaCy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
private static final String hline = "-------------------------------------------------------------------------------";

/**
@@ -163,10 +163,10 @@ public static String combinedVersionString2PrettyString(String s) {
}

/**
* Combines the version of the proxy with the versionnumber from SVN to a
* Combines the version of YaCy with the version number from SVN into a
* combined version
*
* @param version Current given version for this proxy.
* @param version Current given version.
* @param svn Current version given from svn.
* @return String with the combined version
*/
