Commit
added a crawl start checker which makes a simple analysis of the list of
all given URLs: shows whether each URL can be loaded and whether a robots.txt
and/or a sitemap exists.
Orbiter committed Oct 10, 2012
1 parent 941873f commit abebb3b
Showing 4 changed files with 214 additions and 6 deletions.
74 changes: 74 additions & 0 deletions htroot/CrawlCheck_p.html
@@ -0,0 +1,74 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
<title>YaCy '#[clientname]#': Crawl Start</title>
#%env/templates/metas.template%#
<script type="text/javascript" src="/js/ajax.js"></script>
<script type="text/javascript" src="/js/IndexCreate.js"></script>
<script type="text/javascript">
function check(key){
document.getElementById(key).checked = 'checked';
}
</script>
<style type="text/css">
.nobr {
white-space: nowrap;
}
</style>
</head>
<body id="IndexCreate">

<div id="api"></div>

#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Crawl Check</h2>

<p>This page gives you an analysis of the likely success of a web crawl on the given addresses.</p>

<fieldset>
<legend>
<label>Crawl Check</label>
</legend>
<form id="CrawlCheck" method="post" action="CrawlCheck_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<dl>
<dt><label>List of possible crawl start URLs</label></dt>
<dd>
<textarea name="crawlingURLs" id="crawlingURLs" cols="80" rows="15">#[starturls]#</textarea>
</dd>
<dt></dt>
<dd><input type="submit" name="crawlcheck" value="Check given urls" class="submitready"/>
</dd>
</dl>

</form>
</fieldset>


#(table)#::
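<!-- this block is rendered only when CrawlCheck_p.java sets table=1, i.e. after a check has been submitted -->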
<fieldset><legend>Analysis</legend>
<table border="0" cellpadding="2" cellspacing="2" style="float:left">
<tr>
<th align="left" width="600" class="listing">URL</th>
<th align="right" width="80" class="listing">Access</th>
<th align="right" width="80" class="listing">Robots</th>
<th align="right" width="80" class="listing">Crawl-Delay</th>
<th align="right" width="200" class="listing">Sitemap</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="left" nowrap class="listing"><a href="#[url]#" class="listing">#[url]#</a></td>
<td align="right" nowrap class="listing">#[access]#</td>
<td align="right" nowrap class="listing">#[robots]#</td>
<td align="right" nowrap class="listing">#[crawldelay]#</td>
<td align="right" nowrap class="listing">#[sitemap]#</td>
</tr>
#{/list}#
</table>
</fieldset>
#(/table)#


#%env/templates/footer.template%#
</body>
</html>
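
The form above posts the crawlingURLs text area together with the crawlcheck button back to CrawlCheck_p.html. For illustration, a minimal sketch of driving the page programmatically; it assumes a YaCy instance on the default port 8090, placeholder admin credentials, and that the server also accepts URL-encoded form data in place of the multipart encoding used by the browser form — none of this is part of the commit itself.

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class CrawlCheckClient {
    public static void main(String[] args) throws Exception {
        // two start URLs, separated by a line break as the servlet expects
        String urls = "http://example.org\nwww.example.net";
        String body = "crawlcheck=check&crawlingURLs=" + URLEncoder.encode(urls, "UTF-8");
        HttpURLConnection con = (HttpURLConnection)
                new URL("http://localhost:8090/CrawlCheck_p.html").openConnection();
        con.setRequestMethod("POST");
        con.setDoOutput(true);
        con.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        // *_p.html pages are protected; send basic auth with the local admin account (placeholder credentials)
        String auth = Base64.getEncoder()
                .encodeToString("admin:password".getBytes(StandardCharsets.UTF_8));
        con.setRequestProperty("Authorization", "Basic " + auth);
        try (OutputStream out = con.getOutputStream()) {
            out.write(body.getBytes(StandardCharsets.UTF_8));
        }
        System.out.println("HTTP " + con.getResponseCode());
    }
}

The response body is the rendered page containing the analysis table described above.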
132 changes: 132 additions & 0 deletions htroot/CrawlCheck_p.java
@@ -0,0 +1,132 @@
/**
* CrawlCheck_p
* Copyright 2012 by Michael Peter Christen
* First released 10.10.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;


public class CrawlCheck_p {

public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
prop.put("starturls", "");
if (post == null) return prop;

if (post.containsKey("crawlcheck")) {

// get the list of rootURLs for this crawl start
Set<DigestURI> rootURLs = new HashSet<DigestURI>();
String crawlingStart0 = post.get("crawlingURLs","").trim();
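// start URLs may be separated by line breaks or, if given on a single line, by the '|' character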
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
for (String crawlingStart: rootURLs0) {
if (crawlingStart == null || crawlingStart.length() == 0) continue;
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://",0);
if (pos == -1) {
if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
}
try {
DigestURI crawlingStartURL = new DigestURI(crawlingStart);
rootURLs.add(crawlingStartURL);
} catch (MalformedURLException e) {
Log.logException(e);
}
}

if (rootURLs.size() == 0) {
prop.put("table", 0);
} else {
prop.put("table", 1);

// make a string that is used to fill the starturls field again
// and analyze the urls to make the table rows
StringBuilder s = new StringBuilder(300);
int row = 0;
for (DigestURI u: rootURLs) {
s.append(u.toNormalform(true, true)).append('\n');
prop.put("table_list_" + row + "_url", u.toNormalform(true, true));

// try to load the robots
RobotsTxtEntry robotsEntry;
boolean robotsAllowed = true;
try {
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
if (robotsEntry == null) {
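// no robots.txt found: report the crawler's default minimum load delay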
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true, true));
}
} catch (final IOException e) {
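// robots.txt could not be loaded; no robots information is shown for this URL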
}

// try to load the url
if (robotsAllowed) try {
Request request = sb.loader.request(u, true, false);
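// load without the cache so the check reflects the current live state; BlacklistType.CRAWLER applies the crawler blacklist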
final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
if (response == null) {
prop.put("table_list_" + row + "_access", "no response");
} else {
if (response.getResponseHeader().getStatusCode() == 200) {
prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified());
} else {
prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed");
}
}
} catch (final IOException e) {
prop.put("table_list_" + row + "_access", "error response: " + e.getMessage());
} else {
prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt");
}
row++;

}
prop.put("table_list", row);
prop.put("starturls", s.toString());

}
}

return prop;
}

}
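
The separator handling at the top of respond() accepts either one URL per line or a single line of '|'-separated URLs. A small standalone illustration of that rule (the input string is hypothetical, not from the commit):

import java.util.regex.Pattern;

public class SplitDemo {
    public static void main(String[] args) {
        String input = "http://example.org|www.example.net";
        // same rule as in CrawlCheck_p: split on line breaks if present, otherwise on '|'
        String[] parts = input.indexOf('\n') > 0 || input.indexOf('\r') > 0
                ? input.split("[\\r\\n]+")
                : input.split(Pattern.quote("|"));
        for (String p : parts) System.out.println(p);
        // prints http://example.org and www.example.net; the second entry later
        // receives the http:// prefix because it starts with "www"
    }
}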
6 changes: 0 additions & 6 deletions htroot/Crawler_p.java
@@ -3,12 +3,6 @@
// first published 18.12.2006 on http://www.anomic.de
// this file was created using the an implementation from IndexCreate_p.java, published 02.12.2004
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
8 changes: 8 additions & 0 deletions htroot/env/templates/submenuIndexCreate.template
@@ -36,4 +36,12 @@
<li><a href="/IndexImportMediawiki_p.html" class="MenuItemLink lock">Dump Reader for <br/>MediaWiki dumps</a></li>
</ul>
</div>

<div class="SubMenugroup">
<h3>Target Analysis</h3>
<ul class="SubMenu">
<li><a href="/CrawlCheck_p.html" class="MenuItemLink lock">Mass Crawl<br/>Check</a></li>
<li><a href="/RegexTest.html" class="MenuItemLink">Regex<br/>Test</a></li>
</ul>
</div>
</div>
