extended crawling constraints:
- removed the never-used secondary crawl depth
- added a must-not-match filter that can be used to exclude URLs from a crawl
- added a stub for crawl tags, which will be used to identify search results that were produced by specific crawls
Please update the yacybar: replace the property name 'crawlFilter' with 'mustmatch'.
Additionally, a new parameter named 'mustnotmatch' can be used; it should default to the empty string (match-never).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5342 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Nov 14, 2008
1 parent 96174b2 commit dba7ef5
Showing 16 changed files with 193 additions and 123 deletions.
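
The diff below replaces the single crawl filter with a must-match / must-not-match pair. As a rough sketch of the intended accept rule (an interpretation of this commit, not code taken from it; the constant values ".*" and "" are assumed from the defaults used in the changed files), a URL is only crawled if it matches the first pattern and does not match the second:

import java.util.regex.Pattern;

// Sketch of the accept rule implied by the new filter pair. The constant values are
// assumptions based on the defaults visible in this commit: MATCH_ALL is the catch-all
// pattern and MATCH_NEVER the empty, never-matching string.
public class CrawlFilterSketch {
    static final String MATCH_ALL = ".*";  // assumed value of CrawlProfile.MATCH_ALL
    static final String MATCH_NEVER = ""; // assumed value of CrawlProfile.MATCH_NEVER

    static boolean accept(final String url, final String mustMatch, final String mustNotMatch) {
        // the URL must match the first pattern and must not match the second
        return Pattern.matches(mustMatch, url) && !Pattern.matches(mustNotMatch, url);
    }

    public static void main(final String[] args) {
        // with the defaults, every URL is accepted
        System.out.println(accept("http://example.org/a.html", MATCH_ALL, MATCH_NEVER)); // true
        // a must-not-match filter can now exclude URLs from a crawl
        System.out.println(accept("http://example.org/a.pdf", MATCH_ALL, ".*\\.pdf"));   // false
    }
}
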
6 changes: 4 additions & 2 deletions htroot/CrawlProfileEditor_p.html
@@ -30,7 +30,8 @@ <h2>Crawl Profile Editor</h2>
<td><strong>Status</strong></td>
<td><strong>Start URL</strong></td>
<td><strong>Depth</strong></td>
<td><strong>Filter</strong></td>
<td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td>
<td><strong>Auto Filter Content</strong></td>
@@ -48,7 +49,8 @@ <h2>Crawl Profile Editor</h2>
<td>#(status)#terminated::active#(/status)#</td>
<td><a href="#[startURL]#">#[startURL]#</a></td>
<td>#[depth]#</td>
<td>#[filter]#</td>
<td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
11 changes: 5 additions & 6 deletions htroot/CrawlProfileEditor_p.java
@@ -62,10 +62,8 @@ public eentry(final String name, final String label, final boolean readonly, fin
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING));
labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@@ -214,8 +212,9 @@ private static void putProfileEntry(final servletProperties prop, final CrawlPro
prop.put("crawlProfiles_" + count + "_name", profile.name());
prop.putXML("crawlProfiles_" + count + "_startURL", profile.startURL());
prop.put("crawlProfiles_" + count + "_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
prop.put("crawlProfiles_" + count + "_depth", profile.depth());
prop.put("crawlProfiles_" + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));

3 changes: 2 additions & 1 deletion htroot/CrawlProfileEditor_p.xml
@@ -6,7 +6,8 @@
<status>#(status)#terminated::active#(/status)#</status>
<starturl>#[startURL]#</starturl>
<depth>#[depth]#</depth>
<filter>#[filter]#</filter>
<mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent>
18 changes: 15 additions & 3 deletions htroot/CrawlStart_p.html
@@ -100,18 +100,30 @@ <h2>Crawl Start</h2>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlingFilter">Crawling Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="crawlingFilter" id="crawlingFilter" type="text" size="20" maxlength="100" value="#[crawlingFilter]#" /><br />
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
<input type="radio" name="range" value="domain" />Restrict to start domain<br />
<input type="radio" name="range" value="subpath" />Restrict to sub-path
</td>
<td>
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled; default is 'catch all'.
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled;
default is 'catch all'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="80" maxlength="100" value="#[mustnotmatch]#" />
</td>
<td>
This filter must not match to allow that the page is accepted for crawling.
The empty string is a never-match filter which should do well for most cases.
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Re-crawl known URLs:</td>
<td>
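
The two description cells above explain the semantics of the new fields. A small illustration (the filter values are hypothetical, not part of this commit): a must-match filter that restricts the crawl to one host combined with a must-not-match filter that excludes PDF documents keeps or drops URLs as follows:

import java.util.regex.Pattern;

// Hypothetical values a user might enter into the new Must-Match / Must-Not-Match fields.
public class CrawlStartFilterExample {
    public static void main(final String[] args) {
        final String mustMatch    = ".*example\\.org.*"; // emacs-like regex that must match the URL
        final String mustNotMatch = ".*\\.pdf";          // URLs matching this pattern are excluded

        final String[] urls = {
            "http://www.example.org/docs/index.html",  // accepted
            "http://www.example.org/docs/manual.pdf",  // rejected by the must-not-match filter
            "http://www.other.net/index.html"          // rejected by the must-match filter
        };
        for (final String url : urls) {
            final boolean accepted = Pattern.matches(mustMatch, url) && !Pattern.matches(mustNotMatch, url);
            System.out.println(url + " -> " + (accepted ? "crawl" : "skip"));
        }
    }
}
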
4 changes: 3 additions & 1 deletion htroot/CrawlStart_p.java
@@ -24,6 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
@@ -44,7 +45,8 @@ public static serverObjects respond(final httpRequestHeader header, final server
prop.put("starturl", (intranet) ? repository : "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", (intranet) ? repository + ".*" : ".*");
prop.put("mustmatch", (intranet) ? repository + ".*" : CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);

prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");
13 changes: 7 additions & 6 deletions htroot/QuickCrawlLink_p.java
@@ -91,7 +91,8 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String title = post.get("title",null);

// getting other parameters if set
final String crawlingFilter = post.get("crawlingFilter", ".*");
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean indexText = post.get("indexText", "on").equals("on");
@@ -129,11 +130,11 @@ public static serverObjects respond(final httpRequestHeader header, final server
try {
pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingFilter,
crawlingFilter,
CrawlingDepth,
CrawlingDepth,
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
crawlingMustMatch,
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction
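
QuickCrawlLink_p is the interface that the yacybar toolbar calls, which is why the commit message asks for the property rename on the toolbar side. A sketch of the query parameters a client would now send (only 'mustmatch', 'mustnotmatch' and the other post.get() names visible in the hunk above come from this commit; the values and the idea of assembling them into a query string are illustrative):

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

// Sketch of the renamed filter parameters in a QuickCrawlLink_p request.
// Parameter names are taken from the post.get() calls above; values are examples only.
public class QuickCrawlLinkParamsSketch {
    public static void main(final String[] args) throws UnsupportedEncodingException {
        final StringBuilder query = new StringBuilder();
        query.append("mustmatch=").append(URLEncoder.encode(".*", "UTF-8"));    // was: crawlingFilter
        query.append("&mustnotmatch=").append(URLEncoder.encode("", "UTF-8"));  // new; empty = match-never
        query.append("&crawlingDepth=0&crawlingQ=on&indexText=on");
        System.out.println(query); // append this to the URL of the QuickCrawlLink_p servlet
    }
}
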
52 changes: 36 additions & 16 deletions htroot/WatchCrawler_p.java
@@ -123,16 +123,16 @@ public static serverObjects respond(final httpRequestHeader header, final server
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);

// set the crawling filter
String newcrawlingfilter = post.get("crawlingFilter", ".*");
if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted

String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
env.setConfig("crawlingFilter", newcrawlingfilter);

final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
@@ -183,12 +183,12 @@ public static serverObjects respond(final httpRequestHeader header, final server
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {

// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);

// stack request
// first delete old entry, if exists
@@ -201,8 +201,12 @@ public static serverObjects respond(final httpRequestHeader header, final server
// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingStartURL.getHost(),
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
@@ -270,7 +274,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -286,7 +290,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);

// loading the file content
final File file = new File(fileName);
@@ -306,7 +310,21 @@ public static serverObjects respond(final httpRequestHeader header, final server

// creating a crawler profile
final yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(
fileName, crawlURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw);

// pause local crawl here
sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -333,7 +351,7 @@ public static serverObjects respond(final httpRequestHeader header, final server
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -353,8 +371,10 @@ public static serverObjects respond(final httpRequestHeader header, final server

// create a new profile
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
sitemapURLStr, sitemapURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
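
The hunks above also keep the two special cases that override the submitted must-match filter: 'Restrict to start domain' and 'Restrict to sub-path'. A short worked example of the patterns they produce (the start URL is illustrative):

// Worked example of the special-case pattern construction shown in WatchCrawler_p.java above.
public class RangePatternExample {
    public static void main(final String[] args) {
        final String host  = "www.example.org";                        // crawlingStartURL.getHost()
        final String start = "http://www.example.org/wiki/index.html"; // normalized start URL

        // "Restrict to start domain": wrap the host into catch-all patterns
        final String fullDomain = ".*" + host + ".*";                  // -> ".*www.example.org.*"

        // "Restrict to sub-path": keep everything up to the last '/' and append ".*"
        final int pos = start.lastIndexOf("/");
        final String subPath = start.substring(0, pos + 1) + ".*";     // -> "http://www.example.org/wiki/.*"

        System.out.println(fullDomain);
        System.out.println(subPath);
    }
}

A submitted must-match value shorter than two characters still falls back to the catch-all pattern, so an accidentally empty field cannot filter out every URL.
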
3 changes: 0 additions & 3 deletions htroot/sharedBlacklist_p.java
@@ -104,7 +104,6 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String Hash = post.get("hash");

// generate the download URL
String downloadURL = null;
String downloadURLOld = null;
if( sb.webIndex.seedDB != null ){ //no nullpointer error..
final yacySeed seed = sb.webIndex.seedDB.getConnected(Hash);
@@ -113,8 +112,6 @@ public static serverObjects respond(final httpRequestHeader header, final server
final String Port = seed.get(yacySeed.PORT, "8080");
final String peerName = seed.get(yacySeed.NAME, "<" + IP + ":" + Port + ">");
prop.putHTML("page_source", peerName);

downloadURL = "http://" + IP + ":" + Port + "/xml/blacklists.xml";
downloadURLOld = "http://" + IP + ":" + Port + "/yacy/list.html?col=black";
} else {
prop.put("status", STATUS_PEER_UNKNOWN);//YaCy-Peer not found
