Commit
added a crawl start checker which performs a simple analysis of the list of
all given URLs: it shows whether each URL can be loaded and whether a robots.txt and/or a sitemap is available.
Showing 4 changed files with 214 additions and 6 deletions.
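The check described above boils down to two probes per start URL: can the address be loaded at all, and does the host publish a robots.txt (possibly announcing a sitemap and a crawl delay)? As orientation before the diffs, here is a minimal standalone sketch of that idea using only the JDK. It is not the committed code, which goes through YaCy's loader, robots cache and blacklist as the servlet below shows; the class name and the example URL are made up for illustration.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class CrawlStartCheckSketch {

    public static void main(String[] args) throws IOException {
        check("http://example.org/");   // illustrative start URL
    }

    static void check(String startUrl) throws IOException {
        URL url = new URL(startUrl);

        // probe 1: can the start URL be loaded?
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("HEAD");
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        System.out.println("access: " + con.getResponseCode());
        con.disconnect();

        // probe 2: is there a robots.txt, and does it announce a sitemap?
        URL robotsUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
        HttpURLConnection rcon = (HttpURLConnection) robotsUrl.openConnection();
        rcon.setConnectTimeout(5000);
        rcon.setReadTimeout(5000);
        if (rcon.getResponseCode() != 200) {
            System.out.println("robots: no robots");
            rcon.disconnect();
            return;
        }
        String sitemap = "-";
        try (BufferedReader in = new BufferedReader(new InputStreamReader(rcon.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.toLowerCase().startsWith("sitemap:")) {
                    sitemap = line.substring("sitemap:".length()).trim();
                    break;
                }
            }
        }
        System.out.println("robots: robots exist, sitemap: " + sitemap);
        rcon.disconnect();
    }
}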
@@ -0,0 +1,74 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
  <title>YaCy '#[clientname]#': Crawl Start</title>
  #%env/templates/metas.template%#
  <script type="text/javascript" src="/js/ajax.js"></script>
  <script type="text/javascript" src="/js/IndexCreate.js"></script>
  <script type="text/javascript">
    function check(key){
      document.getElementById(key).checked = 'checked';
    }
  </script>
  <style type="text/css">
    .nobr {
      white-space: nowrap;
    }
  </style>
</head>
<body id="IndexCreate">

  <div id="api"></div>

  #%env/templates/header.template%#
  #%env/templates/submenuIndexCreate.template%#
  <h2>Crawl Check</h2>

  <p>This page gives you an analysis of the possible success of a web crawl on the given addresses.</p>

  <fieldset>
    <legend>
      <label>Crawl Check</label>
    </legend>
    <form id="CrawlCheck" method="post" action="CrawlCheck_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
      <dl>
        <dt><label>List of possible crawl start URLs</label></dt>
        <dd>
          <textarea name="crawlingURLs" id="crawlingURLs" cols="80" rows="15" size="80">#[starturls]#</textarea>
        </dd>
        <dt></dt>
        <dd><input type="submit" name="crawlcheck" value="Check given urls" class="submitready"/>
        </dd>
      </dl>

    </form>
  </fieldset>


  #(table)#::
  <fieldset><legend>Analysis</legend>
  <table border="0" cellpadding="2" cellspacing="2" style="float:left">
    <tr>
      <th align="left" width="600" class="listing">URL</th>
      <th align="right" width="80" class="listing">Access</th>
      <th align="right" width="80" class="listing">Robots</th>
      <th align="right" width="80" class="listing">Crawl-Delay</th>
      <th align="right" width="200" class="listing">Sitemap</th>
    </tr>
    #{list}#
    <tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
      <td align="left" nowrap class="listing"><a href="#[url]#" class="listing">#[url]#</a></td>
      <td align="right" nowrap class="listing">#[access]#</td>
      <td align="right" nowrap class="listing">#[robots]#</td>
      <td align="right" nowrap class="listing">#[crawldelay]#</td>
      <td align="right" nowrap class="listing">#[sitemap]#</td>
    </tr>
    #{/list}#
  </table>
  </fieldset>
  #(/table)#


  #%env/templates/footer.template%#
</body>
</html>
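A short note on the template markers used above, since the servlet in the next file fills them: #%file%# includes another template, #[name]# is replaced by the value the servlet stores under that key, #(name)#...::...#(/name)# picks one of the ::-separated alternatives by the number stored under the key, and #{name}#...#{/name}# is repeated once per row, with the row count stored under the key itself and the cell values under key_<row>_<field>. This corresponds to the prop.put("table", ...), prop.put("table_list", row) and prop.put("table_list_" + row + "_url", ...) calls in the CrawlCheck_p servlet below.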
@@ -0,0 +1,132 @@
/**
 * CrawlCheck_p
 * Copyright 2012 by Michael Peter Christen
 * First released 10.10.2011 at http://yacy.net
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;


public class CrawlCheck_p {

    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();
        prop.put("starturls", "");
        if (post == null) return prop;

        if (post.containsKey("crawlcheck")) {

            // get the list of rootURLs for this crawl start
            Set<DigestURI> rootURLs = new HashSet<DigestURI>();
            String crawlingStart0 = post.get("crawlingURLs","").trim();
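            // the textarea content may hold one URL per line or several URLs separated by '|'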
            String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
            for (String crawlingStart: rootURLs0) {
                if (crawlingStart == null || crawlingStart.length() == 0) continue;
                // add the prefix http:// if necessary
                int pos = crawlingStart.indexOf("://",0);
                if (pos == -1) {
                    if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
                    if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
                }
                try {
                    DigestURI crawlingStartURL = new DigestURI(crawlingStart);
                    rootURLs.add(crawlingStartURL);
                } catch (MalformedURLException e) {
                    Log.logException(e);
                }
            }

            if (rootURLs.size() == 0) {
                prop.put("table", 0);
            } else {
                prop.put("table", 1);

                // make a string that is used to fill the starturls field again
                // and analyze the urls to make the table rows
                StringBuilder s = new StringBuilder(300);
                int row = 0;
                for (DigestURI u: rootURLs) {
                    s.append(u.toNormalform(true, true)).append('\n');
                    prop.put("table_list_" + row + "_url", u.toNormalform(true, true));

                    // try to load the robots
                    RobotsTxtEntry robotsEntry;
                    boolean robotsAllowed = true;
                    try {
                        robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
                        if (robotsEntry == null) {
                            prop.put("table_list_" + row + "_robots", "no robots");
                            prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
                            prop.put("table_list_" + row + "_sitemap", "");
                        } else {
                            robotsAllowed = !robotsEntry.isDisallowed(u);
                            prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
                            prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
                            prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true, true));
                        }
                    } catch (final IOException e) {
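                        // robots.txt could not be loaded; report nothing for the robots columns and still treat the URL as crawlable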
                    }

                    // try to load the url
                    if (robotsAllowed) try {
                        Request request = sb.loader.request(u, true, false);
                        final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
                        if (response == null) {
                            prop.put("table_list_" + row + "_access", "no response");
                        } else {
                            if (response.getResponseHeader().getStatusCode() == 200) {
                                prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified());
                            } else {
                                prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed");
                            }
                        }
                    } catch (final IOException e) {
                        prop.put("table_list_" + row + "_access", "error response: " + e.getMessage());
                    } else {
                        prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt");
                    }
                    row++;

                }
                prop.put("table_list", row);
                prop.put("starturls", s.toString());

            }
        }

        return prop;
    }

}
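Finally, a usage sketch: once deployed, the check can be triggered over HTTP by posting the form fields defined in CrawlCheck_p.html (crawlingURLs and crawlcheck). This client is an illustration only; the peer address, the default port 8090, the use of a url-encoded body instead of multipart/form-data, and the modern java.net.http client are assumptions, and the admin credentials normally required for _p pages are omitted.

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class CrawlCheckCall {

    public static void main(String[] args) throws Exception {
        // assumed: a YaCy peer listening on localhost:8090; admin authentication omitted
        String form = "crawlcheck=" + URLEncoder.encode("Check given urls", StandardCharsets.UTF_8)
                    + "&crawlingURLs=" + URLEncoder.encode("http://example.org/\nhttp://example.net/", StandardCharsets.UTF_8);

        HttpRequest request = HttpRequest.newBuilder(URI.create("http://localhost:8090/CrawlCheck_p.html"))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(HttpRequest.BodyPublishers.ofString(form))
                .build();

        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());

        // the body is the rendered page; the analysis table appears when at least one URL was accepted
        System.out.println(response.statusCode());
        System.out.println(response.body());
    }
}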