Permalink
Browse files

Added a specific default crawl profile for the recrawl job.

- with only a light constraint on the load date of known indexed documents, as it
can already be controlled by the selection query, and the goal of the
job is indeed to recrawl the selected documents now
- using the iffresh cache strategy
  • Loading branch information...
luccioman committed Jan 13, 2018
1 parent adf3fa4 commit b712a0671e8208f311b7ef2812614a0156f406ae
Showing with 41 additions and 3 deletions.
  1. +16 −2 source/net/yacy/crawler/CrawlSwitchboard.java
  2. +25 −1 source/net/yacy/crawler/RecrawlBusyThread.java
@@ -61,6 +61,7 @@
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob";
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@@ -88,7 +89,13 @@
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
// Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl
/**
* The default recrawl time cycle in minutes for recrawl jobs. The recrawl date
* limit can be set up by the recrawl job selection query, but a default limit
* prevents unwanted overload on targets.
*/
public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // one hour
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
@@ -104,7 +111,7 @@
private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;
public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile, defaultRecrawlJobProfile;
private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile
private final File queuesRoot;
private Switchboard switchboard;
@@ -466,6 +473,13 @@ private void initActiveCrawlProfiles() {
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
// generate new default entry for RecrawlBusyThread
this.defaultRecrawlJobProfile = RecrawlBusyThread.buildDefaultCrawlProfile();
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRecrawlJobProfile.handle()),
this.defaultRecrawlJobProfile);
// generate new default entry for greedy learning
this.defaultTextGreedyLearningProfile =
new CrawlProfile(
@@ -34,6 +34,8 @@
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
@@ -183,7 +185,7 @@ private boolean feedToCrawler() {
int added = 0;
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
final CrawlProfile profile = sb.crawler.defaultRecrawlJobProfile;
for (final DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
@@ -302,6 +304,28 @@ private boolean processSingleQuery() {
}
return true;
}
/**
 * Builds the default crawl profile used by the recrawl job. It applies only a
 * light constraint on the load date of already-indexed documents (one recrawl
 * cycle — see CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), since
 * document selection is expected to be controlled by the recrawl job's
 * selection query, and it uses the IFFRESH cache strategy.
 *
 * @return a new default CrawlProfile instance to be used for recrawl jobs.
 */
public static CrawlProfile buildDefaultCrawlProfile() {
CrawlProfile profile = new CrawlProfile(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB, CrawlProfile.MATCH_ALL_STRING, // crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, // indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1, // presumably depth, directDocByURL, recrawlIfOlder, domMaxPages — NOTE(review): confirm against the CrawlProfile constructor signature
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, // remaining flags uncommented in original — verify order against CrawlProfile constructor; IFFRESH reuses cached content only while still fresh
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB, // collection name for documents indexed by this profile
ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0); // crawler user agent; trailing nulls/0 presumably optional settings — TODO confirm
return profile;
}
@Override
public int getJobCount() {

0 comments on commit b712a06

Please sign in to comment.