Skip to content
Permalink
Browse files

New optional crawl filter on the URL a doc must match to crawl its links

For finer control over which parsed documents can trigger an addition of
their links to the crawl stack, complementary to the existing crawl
depth parameter.
  • Loading branch information...
luccioman committed May 1, 2019
1 parent 8d3e029 commit 6b45cd579922574059e5385153b84be3ca07533b
@@ -23,6 +23,8 @@
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
@@ -174,6 +174,7 @@
// remove if MATCH_NEVER_STRING
disableIf('mustnotmatch', defaultMatchNone);
disableIf('crawlerOriginURLMustNotMatch', defaultMatchNone);
disableIf('ipMustnotmatch', defaultMatchNone);
disableIf('indexmustnotmatch', defaultMatchNone);
disableIf('indexcontentmustnotmatch', defaultMatchNone);
@@ -183,6 +184,7 @@
// remove if MATCH_ALL_STRING
disableIf('mustmatch', defaultMatchAll);
disableIf('crawlerOriginURLMustMatch', defaultMatchAll);
disableIf('ipMustmatch', defaultMatchAll);
disableIf('indexmustmatch', defaultMatchAll);
disableIf('indexcontentmustmatch', defaultMatchAll);
@@ -354,6 +356,29 @@ <h2>Expert Crawl Start</h2>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
</table>
</dd>

<dt>Load Filter on URL origin of links</dt>
<dd>
<span class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="info"/>
<span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
Example: to allow loading only links from pages on example.org domain, set the must-match filter to '.*example.org.*'.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</span>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustMatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustNotMatch]#" /></td>
</tr>
</table>
</dd>

<dt>Load Filter on IPs</dt>
<dd>
<table style="border-width: 0px">
@@ -248,6 +248,22 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
} else {
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}

// Filter on URL origin of links: must match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}

// Filter on URL origin of links: must-not-match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}

// Load Filter on IPs: must match
if (post != null && post.containsKey("ipMustmatch")) {
@@ -626,7 +626,11 @@ public static serverObjects respond(final RequestHeader header, final serverObje
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);


profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
@@ -99,6 +99,8 @@
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
CRAWLER_URL_MUSTMATCH ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
CRAWLER_URL_MUSTNOTMATCH ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
CRAWLER_ORIGIN_URL_MUSTMATCH ("crawlerOriginURLMustMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Match Filter"),
CRAWLER_ORIGIN_URL_MUSTNOTMATCH ("crawlerOriginURLMustNotMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Not-Match Filter"),
CRAWLER_IP_MUSTMATCH ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
CRAWLER_IP_MUSTNOTMATCH ("crawlerIPMustNotMatch", false, CrawlAttribute.STRING, "IP Must-Not-Match Filter"),
CRAWLER_COUNTRY_MUSTMATCH ("crawlerCountryMustMatch", false, CrawlAttribute.STRING, "Country Must-Match Filter"),
@@ -148,6 +150,13 @@ public String toString() {


private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;

/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustMatch = null;

/** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustNotMatch = null;

private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
@@ -243,6 +252,8 @@ public CrawlProfile(
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
/* Links origin URL filters: start from the neutral defaults (match all / match never).
 * They must not reuse the crawler URL filter values, which apply to the URLs to load and
 * not to the URL of the document the links come from. Callers can still override these
 * entries afterwards with put(). */
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
@@ -501,6 +512,50 @@ public Pattern urlMustNotMatchPattern() {
}
return this.crawlerurlmustnotmatch;
}

/**
 * Provide the compiled "must match" filter applied on the URL of a document
 * before its embedded links may be added to the crawl stack.
 * <p>
 * The regular expression string is read from the
 * {@link CrawlAttribute#CRAWLER_ORIGIN_URL_MUSTMATCH} profile entry and compiled
 * case-insensitively on first use; the result is kept in a field so that later
 * calls return it directly.
 * </p>
 *
 * @return the compiled {@link Pattern}, or {@link CrawlProfile#MATCH_ALL_PATTERN}
 *         when the entry is missing, equals {@link CrawlProfile#MATCH_ALL_STRING}
 *         or is not a valid regular expression
 */
public Pattern getCrawlerOriginUrlMustMatchPattern() {
    if (this.crawlerOriginUrlMustMatch != null) {
        /* Already compiled by a previous call */
        return this.crawlerOriginUrlMustMatch;
    }

    final String regex = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
    if (regex == null || regex.equals(CrawlProfile.MATCH_ALL_STRING)) {
        /* Nothing specific configured: reuse the shared match-all pattern */
        this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
    } else {
        try {
            this.crawlerOriginUrlMustMatch = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        } catch (final PatternSyntaxException e) {
            /* Malformed expression: fall back to the permissive default */
            this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
        }
    }
    return this.crawlerOriginUrlMustMatch;
}

/**
 * Provide the compiled "must not match" filter applied on the URL of a document
 * before its embedded links may be added to the crawl stack.
 * <p>
 * The regular expression string is read from the
 * {@link CrawlAttribute#CRAWLER_ORIGIN_URL_MUSTNOTMATCH} profile entry and compiled
 * case-insensitively on first use; the result is kept in a field so that later
 * calls return it directly.
 * </p>
 *
 * @return the compiled {@link Pattern}, or {@link CrawlProfile#MATCH_NEVER_PATTERN}
 *         when the entry is missing, equals {@link CrawlProfile#MATCH_NEVER_STRING}
 *         or is not a valid regular expression
 */
public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
    if (this.crawlerOriginUrlMustNotMatch != null) {
        /* Already compiled by a previous call */
        return this.crawlerOriginUrlMustNotMatch;
    }

    final String regex = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
    if (regex == null || regex.equals(CrawlProfile.MATCH_NEVER_STRING)) {
        /* Nothing specific configured: reuse the shared match-never pattern */
        this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
    } else {
        try {
            this.crawlerOriginUrlMustNotMatch = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        } catch (final PatternSyntaxException e) {
            /* Malformed expression: fall back to the default that rejects nothing */
            this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
        }
    }
    return this.crawlerOriginUrlMustNotMatch;
}

/**
* Gets the regex which must be matched by IPs in order to be crawled.
@@ -926,6 +981,8 @@ public void putProfileEntry(
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key));

0 comments on commit 6b45cd5

Please sign in to comment.
You can’t perform that action at this time.