fixed a number of small bugs:
- better crawl start for file paths and smb paths
- added a time-out wrapper for DNS resolving and reverse resolving to prevent blocking (see the sketch after this message)
- fixed intranet scanner result list check boxes
- prevented HTCache usage for file and smb crawling (not necessary; the documents are available locally)
- fixed the RSS feed loader
- fixed the sitemap loader, which had not been restricted to single files (crawl depth must be zero)
- crawl result lists are now cleared when a network switch is done
- higher maximum file size for the crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542
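
The DNS time-out wrapper mentioned above is not part of the hunks shown on this page. As a rough illustration of the idea only — class, method, and pool names are hypothetical, not the YaCy implementation — a resolver call can be pushed onto an executor and abandoned after a deadline, so a slow or dead DNS server cannot block the calling thread:

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    // Hypothetical time-out wrapper around forward DNS resolution; reverse
    // resolution (InetAddress.getHostName()) can be wrapped the same way.
    public class TimedDNS {
        private static final ExecutorService POOL = Executors.newCachedThreadPool();

        public static InetAddress resolve(final String host, final long timeoutMillis)
                throws UnknownHostException {
            final Future<InetAddress> job = POOL.submit(new Callable<InetAddress>() {
                public InetAddress call() throws UnknownHostException {
                    return InetAddress.getByName(host); // may block on a slow resolver
                }
            });
            try {
                return job.get(timeoutMillis, TimeUnit.MILLISECONDS);
            } catch (final TimeoutException e) {
                job.cancel(true); // give up instead of blocking the caller
                throw new UnknownHostException(host + " (DNS time-out)");
            } catch (final Exception e) {
                throw new UnknownHostException(host + ": " + e.getMessage());
            }
        }
    }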
orbiter committed Sep 30, 2010
1 parent f6eebb6 commit 2c549ae
Showing 22 changed files with 229 additions and 99 deletions.
4 changes: 2 additions & 2 deletions defaults/yacy.init
@@ -674,10 +674,10 @@ crawler.clientTimeout=9000
crawler.http.acceptEncoding=gzip
crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=1048576
crawler.http.maxFileSize=10485760

# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=1048576
crawler.ftp.maxFileSize=10485760

# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=100000000
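
The values are plain byte counts: 1048576 is 1 MiB and 10485760 is 10 MiB, so the HTTP and FTP limits are raised by a factor of ten while the SMB limit stays at 100 MB. A minimal sketch of how such a limit might be applied to a fetched document (hypothetical helper, not the YaCy loader):

    // Hypothetical guard illustrating the crawler.*.maxFileSize semantics.
    public class MaxFileSizeGuard {
        private final long maxFileSize; // bytes, e.g. 10485760 = 10 * 1024 * 1024

        public MaxFileSizeGuard(final long maxFileSize) {
            this.maxFileSize = maxFileSize;
        }

        // Accept a document only if its announced length fits the limit; a negative
        // length means no Content-Length was sent, so the stream would have to be
        // capped while reading instead.
        public boolean accept(final long contentLength) {
            return contentLength >= 0 && contentLength <= maxFileSize;
        }
    }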
1 change: 0 additions & 1 deletion htroot/CrawlProfileEditor_p.java
@@ -92,7 +92,6 @@ public eentry(final String name, final String label, final boolean readonly, fin
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
11 changes: 6 additions & 5 deletions htroot/CrawlResults.java
@@ -188,15 +188,16 @@ public static serverObjects respond(final RequestHeader header, serverObjects po
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
if (urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
} else {
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
continue;
}
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL

initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash));
executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash));

2 changes: 1 addition & 1 deletion htroot/CrawlStartIntranet_p.html
@@ -36,7 +36,7 @@ <h2>Intranet Crawl Start</h2>
#(/notintranet)#

#(servertable)#::
<form id="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<form id="servertable" name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
3 changes: 1 addition & 2 deletions htroot/CrawlStartSite_p.html
@@ -45,8 +45,7 @@ <h2>Site Crawling</h2>
<span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
57 changes: 36 additions & 21 deletions htroot/Crawler_p.java
@@ -138,14 +138,13 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start


// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = crawlingStartURL.isFile() ? "file:///.*" : crawlingStartURL.isSMB() ? "smb://.*" : ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
@@ -203,7 +202,8 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");

final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false;
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");

final String cachePolicyString = post.get("cachePolicy", "iffresh");
@@ -247,15 +247,21 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
(crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@@ -352,7 +358,8 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
fileName,
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
@@ -362,9 +369,10 @@ public static serverObjects respond(final RequestHeader header, final serverObje
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -405,15 +413,21 @@ public static serverObjects respond(final RequestHeader header, final serverObje
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
sitemapURLStr,
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
0,
crawlingIfOlder,
crawlingDomMaxPages,
true,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
@@ -431,7 +445,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String title = scraper.getTitle();
// String description = scraper.getDescription();

// get links and generate filter
@@ -444,7 +458,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje

// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile(
title == null || title.length() == 0 ? sitelistURL.getHost() : title,
sitelistURL.getHost(),
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -455,9 +469,10 @@ public static serverObjects respond(final RequestHeader header, final serverObje
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
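
A key edit in this file is the must-match filter for the "full domain" range: for file:// and smb:// start points the host-based pattern ".*host.*" makes no sense, so the filter is now chosen by protocol. A standalone sketch of that decision (the regexes are copied from the hunk; the helper and its parameters are hypothetical stand-ins for the DigestURI accessors):

    import java.util.regex.Pattern;

    // Sketch of the protocol-dependent "full domain" must-match filter.
    public class FullDomainFilter {
        public static String pattern(final boolean isFile, final boolean isSMB, final String host) {
            if (isFile) return "file:///.*";   // keep a file crawl inside the local file tree
            if (isSMB) return "smb://.*";      // keep an smb crawl on smb paths
            return ".*" + host + ".*";         // http/ftp: restrict to the start host
        }

        public static void main(final String[] args) {
            final Pattern mustMatch = Pattern.compile(pattern(false, true, null));
            System.out.println(mustMatch.matcher("smb://fileserver/share/doc.pdf").matches()); // true
        }
    }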
1 change: 0 additions & 1 deletion htroot/QuickCrawlLink_p.java
@@ -157,7 +157,6 @@ public static serverObjects respond(final RequestHeader header, final serverObje
indexText,
indexMedia,
storeHTCache,
true,
remoteIndexing,
xsstopw,
xdstopw,
3 changes: 2 additions & 1 deletion htroot/SettingsAck_p.java
@@ -36,6 +36,7 @@
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Digest;
@@ -111,7 +112,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final serverCore theServerCore = (serverCore) env.getThread("10_httpd");
try {
final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port);
final String hostName = theNewAddress.getHostName();
final String hostName = Domains.getHostName(theNewAddress.getAddress());
prop.put("info_restart", "1");
prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName);
prop.put("info_restart_port", theNewAddress.getPort());
6 changes: 4 additions & 2 deletions htroot/api/util/getpageinfo_p.java
@@ -26,11 +26,13 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.put("robots-allowed", "3"); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
prop.put("sitelist", "");
prop.put("filter", ".*");

// default actions
String actions="title,robots";

if(post!=null && post.containsKey("url")){
if (post != null && post.containsKey("url")) {
if(post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
@@ -97,7 +99,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
if(actions.indexOf("robots")>=0){
if (actions.indexOf("robots")>=0) {
try {
final DigestURI theURL = new DigestURI(url, null);

4 changes: 2 additions & 2 deletions htroot/js/IndexCreate.js
@@ -48,14 +48,14 @@ function handleResponse(){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
if (sitemap) document.getElementById("sitemap").disabled=false;
}
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
if (sitelist) document.getElementById("sitelist").disabled=false;

// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
20 changes: 9 additions & 11 deletions source/de/anomic/crawler/CrawlProfile.java
@@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
@@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern mustmatch = null, mustnotmatch = null;


public CrawlProfile(final String name, final DigestURI startURL,
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean indexText,
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
@@ -91,7 +95,6 @@ public CrawlProfile(final String name, final DigestURI startURL,
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
@@ -218,11 +221,6 @@ public boolean storeHTCache() {
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;
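
With the storeTXCache flag removed, the constructor takes one boolean fewer; the parameter order visible in the hunk is name, startURL, mustmatch, mustnotmatch, depth, recrawlIfOlder, domMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, remoteIndexing, xsstopw, xdstopw, xpstopw, cacheStrategy. A hedged usage sketch for an smb crawl follows; the literal values, the domMaxPages sentinel, and the exception type are assumptions, and only the parameter order, the MATCH_* constants, and CacheStrategy.IFFRESH come from the diff:

    // Hedged usage sketch against the new CrawlProfile constructor; "sb" is the
    // Switchboard as used in Crawler_p.java.
    try {
        final DigestURI startURL = new DigestURI("smb://fileserver/share/", null);
        final CrawlProfile profile = new CrawlProfile(
                startURL.getHost(),                      // name
                startURL,                                // startURL
                "smb://.*",                              // mustmatch: stay on smb paths
                CrawlProfile.MATCH_NEVER,                // mustnotmatch
                3,                                       // depth
                System.currentTimeMillis() - 86400000L,  // recrawlIfOlder: one day (assumed value)
                -1,                                      // domMaxPages (assumed "no limit" sentinel)
                false,                                   // crawlingQ
                true,                                    // indexText
                true,                                    // indexMedia
                false,                                   // storeHTCache: off for smb, per this commit
                false,                                   // remoteIndexing
                false, false, false,                     // xsstopw, xdstopw, xpstopw
                CrawlProfile.CacheStrategy.IFFRESH);     // cacheStrategy
        sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
    } catch (final MalformedURLException e) {
        // the start URL could not be parsed
    }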
