extended crawling constraints:
- removed the never-used secondary crawl depth
- added a must-not-match filter that can be used to exclude URLs from a crawl
- added a stub for crawl tags, which will be used to identify search results that were produced by specific crawls
Please update the yacybar: replace the property name 'crawlFilter' with 'mustmatch'.
Additionally, a new parameter named 'mustnotmatch' can be used; it should default to the empty string (match-never).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5342 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Nov 14, 2008
1 parent 96174b2 commit dba7ef5
Showing 16 changed files with 193 additions and 123 deletions.
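
The diff below replaces the single crawl filter with a must-match / must-not-match pair. As a rough sketch of the intended accept rule (an interpretation of this commit, not code taken from it; the constant values ".*" and "" are assumed from the defaults used in the changed files), a URL is only crawled if it matches the first pattern and does not match the second:

import java.util.regex.Pattern;

// Sketch of the accept rule implied by the new filter pair. The constant values are
// assumptions based on the defaults visible in this commit: MATCH_ALL is the catch-all
// pattern and MATCH_NEVER the empty, never-matching string.
public class CrawlFilterSketch {
    static final String MATCH_ALL = ".*";  // assumed value of CrawlProfile.MATCH_ALL
    static final String MATCH_NEVER = ""; // assumed value of CrawlProfile.MATCH_NEVER

    static boolean accept(final String url, final String mustMatch, final String mustNotMatch) {
        // the URL must match the first pattern and must not match the second
        return Pattern.matches(mustMatch, url) && !Pattern.matches(mustNotMatch, url);
    }

    public static void main(final String[] args) {
        // with the defaults, every URL is accepted
        System.out.println(accept("http://example.org/a.html", MATCH_ALL, MATCH_NEVER)); // true
        // a must-not-match filter can now exclude URLs from a crawl
        System.out.println(accept("http://example.org/a.pdf", MATCH_ALL, ".*\\.pdf"));   // false
    }
}
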
6 changes: 4 additions & 2 deletions htroot/CrawlProfileEditor_p.html
@@ -30,7 +30,8 @@ <h2>Crawl Profile Editor</h2>
<td><strong>Status</strong></td>
<td><strong>Start URL</strong></td>
<td><strong>Depth</strong></td>
<td><strong>Filter</strong></td>
<td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td>
<td><strong>Auto Filter Content</strong></td>
@@ -48,7 +49,8 @@ <h2>Crawl Profile Editor</h2>
<td>#(status)#terminated::active#(/status)#</td>
<td><a href="#[startURL]#">#[startURL]#</a></td>
<td>#[depth]#</td>
<td>#[filter]#</td>
<td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
11 changes: 5 additions & 6 deletions htroot/CrawlProfileEditor_p.java
@@ -62,10 +62,8 @@ public eentry(final String name, final String label, final boolean readonly, fin
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING));
labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@@ -214,8 +212,9 @@ private static void putProfileEntry(final servletProperties prop, final CrawlPro
prop.put("crawlProfiles_" + count + "_name", profile.name());
prop.putXML("crawlProfiles_" + count + "_startURL", profile.startURL());
prop.put("crawlProfiles_" + count + "_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
prop.put("crawlProfiles_" + count + "_depth", profile.depth());
prop.put("crawlProfiles_" + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));

3 changes: 2 additions & 1 deletion htroot/CrawlProfileEditor_p.xml
@@ -6,7 +6,8 @@
<status>#(status)#terminated::active#(/status)#</status>
<starturl>#[startURL]#</starturl>
<depth>#[depth]#</depth>
<filter>#[filter]#</filter>
<mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent>
18 changes: 15 additions & 3 deletions htroot/CrawlStart_p.html
@@ -100,18 +100,30 @@ <h2>Crawl Start</h2>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlingFilter">Crawling Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="crawlingFilter" id="crawlingFilter" type="text" size="20" maxlength="100" value="#[crawlingFilter]#" /><br />
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
<input type="radio" name="range" value="domain" />Restrict to start domain<br />
<input type="radio" name="range" value="subpath" />Restrict to sub-path
</td>
<td>
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled; default is 'catch all'.
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled;
default is 'catch all'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="80" maxlength="100" value="#[mustnotmatch]#" />
</td>
<td>
This filter must not match to allow that the page is accepted for crawling.
The empty string is a never-match filter which should do well for most cases.
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Re-crawl known URLs:</td>
<td>
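
The two description cells above explain the semantics of the new fields. A small illustration (the filter values are hypothetical, not part of this commit): a must-match filter that restricts the crawl to one host combined with a must-not-match filter that excludes PDF documents keeps or drops URLs as follows:

import java.util.regex.Pattern;

// Hypothetical values a user might enter into the new Must-Match / Must-Not-Match fields.
public class CrawlStartFilterExample {
    public static void main(final String[] args) {
        final String mustMatch    = ".*example\\.org.*"; // emacs-like regex that must match the URL
        final String mustNotMatch = ".*\\.pdf";          // URLs matching this pattern are excluded

        final String[] urls = {
            "http://www.example.org/docs/index.html",  // accepted
            "http://www.example.org/docs/manual.pdf",  // rejected by the must-not-match filter
            "http://www.other.net/index.html"          // rejected by the must-match filter
        };
        for (final String url : urls) {
            final boolean accepted = Pattern.matches(mustMatch, url) && !Pattern.matches(mustNotMatch, url);
            System.out.println(url + " -> " + (accepted ? "crawl" : "skip"));
        }
    }
}
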
4 changes: 3 additions & 1 deletion htroot/CrawlStart_p.java
@@ -24,6 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
@@ -44,7 +45,8 @@ public static serverObjects respond(final httpRequestHeader header, final server
prop.put("starturl", (intranet) ? repository : "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", (intranet) ? repository + ".*" : ".*");
prop.put("mustmatch", (intranet) ? repository + ".*" : CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);

prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");
13 changes: 7 additions & 6 deletions htroot/QuickCrawlLink_p.java
@@ -91,7 +91,8 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String title = post.get("title",null);

// getting other parameters if set
final String crawlingFilter = post.get("crawlingFilter", ".*");
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean indexText = post.get("indexText", "on").equals("on");
@@ -129,11 +130,11 @@ public static serverObjects respond(final httpRequestHeader header, final server
try {
pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingFilter,
crawlingFilter,
CrawlingDepth,
CrawlingDepth,
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
crawlingMustMatch,
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction
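
QuickCrawlLink_p is the interface that the yacybar toolbar calls, which is why the commit message asks for the property rename on the toolbar side. A sketch of the query parameters a client would now send (only 'mustmatch', 'mustnotmatch' and the other post.get() names visible in the hunk above come from this commit; the values and the idea of assembling them into a query string are illustrative):

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

// Sketch of the renamed filter parameters in a QuickCrawlLink_p request.
// Parameter names are taken from the post.get() calls above; values are examples only.
public class QuickCrawlLinkParamsSketch {
    public static void main(final String[] args) throws UnsupportedEncodingException {
        final StringBuilder query = new StringBuilder();
        query.append("mustmatch=").append(URLEncoder.encode(".*", "UTF-8"));    // was: crawlingFilter
        query.append("&mustnotmatch=").append(URLEncoder.encode("", "UTF-8"));  // new; empty = match-never
        query.append("&crawlingDepth=0&crawlingQ=on&indexText=on");
        System.out.println(query); // append this to the URL of the QuickCrawlLink_p servlet
    }
}
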
52 changes: 36 additions & 16 deletions htroot/WatchCrawler_p.java
@@ -123,16 +123,16 @@ public static serverObjects respond(final httpRequestHeader header, final server
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);

// set the crawling filter
String newcrawlingfilter = post.get("crawlingFilter", ".*");
if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted

String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
env.setConfig("crawlingFilter", newcrawlingfilter);

final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
@@ -183,12 +183,12 @@ public static serverObjects respond(final httpRequestHeader header, final server
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {

// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);

// stack request
// first delete old entry, if exists
@@ -201,8 +201,12 @@ public static serverObjects respond(final httpRequestHeader header, final server
// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingStartURL.getHost(),
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
@@ -270,7 +274,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -286,7 +290,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);

// loading the file content
final File file = new File(fileName);
@@ -306,7 +310,21 @@ public static serverObjects respond(final httpRequestHeader header, final server

// creating a crawler profile
final yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(
fileName, crawlURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw);

// pause local crawl here
sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -333,7 +351,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -353,8 +371,10 @@ public static serverObjects respond(final httpRequestHeader header, final server

// create a new profile
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
sitemapURLStr, sitemapURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
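
The hunks above also keep the two special cases that override the submitted must-match filter: 'Restrict to start domain' and 'Restrict to sub-path'. A short worked example of the patterns they produce (the start URL is illustrative):

// Worked example of the special-case pattern construction shown in WatchCrawler_p.java above.
public class RangePatternExample {
    public static void main(final String[] args) {
        final String host  = "www.example.org";                        // crawlingStartURL.getHost()
        final String start = "http://www.example.org/wiki/index.html"; // normalized start URL

        // "Restrict to start domain": wrap the host into catch-all patterns
        final String fullDomain = ".*" + host + ".*";                  // -> ".*www.example.org.*"

        // "Restrict to sub-path": keep everything up to the last '/' and append ".*"
        final int pos = start.lastIndexOf("/");
        final String subPath = start.substring(0, pos + 1) + ".*";     // -> "http://www.example.org/wiki/.*"

        System.out.println(fullDomain);
        System.out.println(subPath);
    }
}

A submitted must-match value shorter than two characters still falls back to the catch-all pattern, so an accidentally empty field cannot filter out every URL.
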
3 changes: 0 additions & 3 deletions htroot/sharedBlacklist_p.java
@@ -104,7 +104,6 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String Hash = post.get("hash");

// generate the download URL
String downloadURL = null;
String downloadURLOld = null;
if( sb.webIndex.seedDB != null ){ //no nullpointer error..
final yacySeed seed = sb.webIndex.seedDB.getConnected(Hash);
@@ -113,8 +112,6 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String Port = seed.get(yacySeed.PORT, "8080");
final String peerName = seed.get(yacySeed.NAME, "<" + IP + ":" + Port + ">");
prop.putHTML("page_source", peerName);

downloadURL = "http://" + IP + ":" + Port + "/xml/blacklists.xml";
downloadURLOld = "http://" + IP + ":" + Port + "/yacy/list.html?col=black";
} else {
prop.put("status", STATUS_PEER_UNKNOWN);//YaCy-Peer not found
