Skip to content

Commit

Permalink
addendum to last commit
Browse files Browse the repository at this point in the history
moved recrawl times for standard profiles to constants
calculate new specific dates in cleanup job

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5082 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
lotus committed Aug 26, 2008
1 parent 480497f commit d9d9c52
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 7 deletions.
3 changes: 2 additions & 1 deletion htroot/CrawlProfileEditor_p.java
Expand Up @@ -25,6 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;

Expand Down Expand Up @@ -215,7 +216,7 @@ private static void putProfileEntry(final servletProperties prop, final CrawlPro
prop.put("crawlProfiles_" + count + "_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));

// start contrib [MN]
Expand Down
6 changes: 5 additions & 1 deletion source/de/anomic/crawler/CrawlProfile.java
@@ -1,4 +1,4 @@
// plasmaCrawlProfile.java
// CrawlProfile.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
Expand Down Expand Up @@ -199,6 +199,10 @@ public void changeEntry(final entry e, final String propName, final String newVa
profileTable.put(e.handle(), e.mem);
}

public long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}

public static class DomProfile {

public String referrer;
Expand Down
26 changes: 26 additions & 0 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -122,6 +122,7 @@ this class is also the core of the http crawling.
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
import de.anomic.data.blogBoardComments;
Expand Down Expand Up @@ -1295,6 +1296,31 @@ public boolean cleanupJob() {
setConfig("adminAccount", "");
}

// refresh recrawl dates
try{
Iterator<CrawlProfile.entry> it = webIndex.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = it.next();
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
// if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_REMOTE));
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
}
} catch (final IOException e) {};

// close unused connections
JakartaCommonsHttpClient.cleanup();

Expand Down
16 changes: 11 additions & 5 deletions source/de/anomic/plasma/plasmaWordIndex.java
Expand Up @@ -90,6 +90,12 @@ public final class plasmaWordIndex implements indexRI {
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db";

public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;


private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
Expand Down Expand Up @@ -249,7 +255,7 @@ private void initActiveCrawlProfiles() {
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, ".*", ".*",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
60 * 24, -1, -1, false,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
Expand All @@ -263,22 +269,22 @@ private void initActiveCrawlProfiles() {
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, false, false, false, false, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, true, true, true, true, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false);
}
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, false, false, false, false, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, false, true, true, true, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
}
}

Expand Down

0 comments on commit d9d9c52

Please sign in to comment.