fixed a number of small bugs:
- better crawl start for file paths and smb paths
- added a time-out wrapper for DNS resolving and reverse resolving to prevent blocking (see the sketch after this message)
- fixed intranet scanner result list check boxes
- prevented HTCache usage for file and smb crawling (not necessary; the documents are available locally)
- fixed the RSS feed loader
- fixed the sitemap loader, which had not been restricted to single files (crawl depth must be zero)
- crawl result lists are now cleared when a network switch is done
- higher maximum file size for the crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542
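
The DNS time-out wrapper mentioned above is not part of the hunks shown on this page. As a rough illustration of the idea only — class, method, and pool names are hypothetical, not the YaCy implementation — a resolver call can be pushed onto an executor and abandoned after a deadline, so a slow or dead DNS server cannot block the calling thread:

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    // Hypothetical time-out wrapper around forward DNS resolution; reverse
    // resolution (InetAddress.getHostName()) can be wrapped the same way.
    public class TimedDNS {
        private static final ExecutorService POOL = Executors.newCachedThreadPool();

        public static InetAddress resolve(final String host, final long timeoutMillis)
                throws UnknownHostException {
            final Future<InetAddress> job = POOL.submit(new Callable<InetAddress>() {
                public InetAddress call() throws UnknownHostException {
                    return InetAddress.getByName(host); // may block on a slow resolver
                }
            });
            try {
                return job.get(timeoutMillis, TimeUnit.MILLISECONDS);
            } catch (final TimeoutException e) {
                job.cancel(true); // give up instead of blocking the caller
                throw new UnknownHostException(host + " (DNS time-out)");
            } catch (final Exception e) {
                throw new UnknownHostException(host + ": " + e.getMessage());
            }
        }
    }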
orbiter committed Sep 30, 2010
1 parent f6eebb6 commit 2c549ae
Showing 22 changed files with 229 additions and 99 deletions.
4 changes: 2 additions & 2 deletions defaults/yacy.init
@@ -674,10 +674,10 @@ crawler.clientTimeout=9000
crawler.http.acceptEncoding=gzip
crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=1048576
crawler.http.maxFileSize=10485760

# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=1048576
crawler.ftp.maxFileSize=10485760

# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=100000000
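
The values are plain byte counts: 1048576 is 1 MiB and 10485760 is 10 MiB, so the HTTP and FTP limits are raised by a factor of ten while the SMB limit stays at 100 MB. A minimal sketch of how such a limit might be applied to a fetched document (hypothetical helper, not the YaCy loader):

    // Hypothetical guard illustrating the crawler.*.maxFileSize semantics.
    public class MaxFileSizeGuard {
        private final long maxFileSize; // bytes, e.g. 10485760 = 10 * 1024 * 1024

        public MaxFileSizeGuard(final long maxFileSize) {
            this.maxFileSize = maxFileSize;
        }

        // Accept a document only if its announced length fits the limit; a negative
        // length means no Content-Length was sent, so the stream would have to be
        // capped while reading instead.
        public boolean accept(final long contentLength) {
            return contentLength >= 0 && contentLength <= maxFileSize;
        }
    }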
1 change: 0 additions & 1 deletion htroot/CrawlProfileEditor_p.java
@@ -92,7 +92,6 @@ public eentry(final String name, final String label, final boolean readonly, fin
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
11 changes: 6 additions & 5 deletions htroot/CrawlResults.java
@@ -188,15 +188,16 @@ public static serverObjects respond(final RequestHeader header, serverObjects po
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
if (urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
} else {
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
continue;
}
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL

initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash));
executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash));

2 changes: 1 addition & 1 deletion htroot/CrawlStartIntranet_p.html
@@ -36,7 +36,7 @@ <h2>Intranet Crawl Start</h2>
#(/notintranet)#

#(servertable)#::
<form id="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<form id="servertable" name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
3 changes: 1 addition & 2 deletions htroot/CrawlStartSite_p.html
@@ -45,8 +45,7 @@ <h2>Site Crawling</h2>
<span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
57 changes: 36 additions & 21 deletions htroot/Crawler_p.java
@@ -138,14 +138,13 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start


// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = crawlingStartURL.isFile() ? "file:///.*" : crawlingStartURL.isSMB() ? "smb://.*" : ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
@@ -203,7 +202,8 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");

final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false;
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");

final String cachePolicyString = post.get("cachePolicy", "iffresh");
@@ -247,15 +247,21 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
(crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@@ -352,7 +358,8 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
fileName,
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
@@ -362,9 +369,10 @@ public static serverObjects respond(final RequestHeader header, final serverObje
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -405,15 +413,21 @@ public static serverObjects respond(final RequestHeader header, final serverObje
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
sitemapURLStr,
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
0,
crawlingIfOlder,
crawlingDomMaxPages,
true,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
@@ -431,7 +445,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String title = scraper.getTitle();
// String description = scraper.getDescription();

// get links and generate filter
@@ -444,7 +458,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje

// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile(
title == null || title.length() == 0 ? sitelistURL.getHost() : title,
sitelistURL.getHost(),
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -455,9 +469,10 @@ public static serverObjects respond(final RequestHeader header, final serverObje
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
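
A key edit in this file is the must-match filter for the "full domain" range: for file:// and smb:// start points the host-based pattern ".*host.*" makes no sense, so the filter is now chosen by protocol. A standalone sketch of that decision (the regexes are copied from the hunk; the helper and its parameters are hypothetical stand-ins for the DigestURI accessors):

    import java.util.regex.Pattern;

    // Sketch of the protocol-dependent "full domain" must-match filter.
    public class FullDomainFilter {
        public static String pattern(final boolean isFile, final boolean isSMB, final String host) {
            if (isFile) return "file:///.*";   // keep a file crawl inside the local file tree
            if (isSMB) return "smb://.*";      // keep an smb crawl on smb paths
            return ".*" + host + ".*";         // http/ftp: restrict to the start host
        }

        public static void main(final String[] args) {
            final Pattern mustMatch = Pattern.compile(pattern(false, true, null));
            System.out.println(mustMatch.matcher("smb://fileserver/share/doc.pdf").matches()); // true
        }
    }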
1 change: 0 additions & 1 deletion htroot/QuickCrawlLink_p.java
@@ -157,7 +157,6 @@ public static serverObjects respond(final RequestHeader header, final serverObje
indexText,
indexMedia,
storeHTCache,
true,
remoteIndexing,
xsstopw,
xdstopw,
3 changes: 2 additions & 1 deletion htroot/SettingsAck_p.java
@@ -36,6 +36,7 @@
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Digest;
@@ -111,7 +112,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final serverCore theServerCore = (serverCore) env.getThread("10_httpd");
try {
final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port);
final String hostName = theNewAddress.getHostName();
final String hostName = Domains.getHostName(theNewAddress.getAddress());
prop.put("info_restart", "1");
prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName);
prop.put("info_restart_port", theNewAddress.getPort());
6 changes: 4 additions & 2 deletions htroot/api/util/getpageinfo_p.java
@@ -26,11 +26,13 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.put("robots-allowed", "3"); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
prop.put("sitelist", "");
prop.put("filter", ".*");

// default actions
String actions="title,robots";

if(post!=null && post.containsKey("url")){
if (post != null && post.containsKey("url")) {
if(post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
@@ -97,7 +99,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
if(actions.indexOf("robots")>=0){
if (actions.indexOf("robots")>=0) {
try {
final DigestURI theURL = new DigestURI(url, null);

4 changes: 2 additions & 2 deletions htroot/js/IndexCreate.js
@@ -48,14 +48,14 @@ function handleResponse(){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
if (sitemap) document.getElementById("sitemap").disabled=false;
}
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
if (sitelist) document.getElementById("sitelist").disabled=false;

// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
20 changes: 9 additions & 11 deletions source/de/anomic/crawler/CrawlProfile.java
@@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
@@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern mustmatch = null, mustnotmatch = null;


public CrawlProfile(final String name, final DigestURI startURL,
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean indexText,
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
@@ -91,7 +95,6 @@ public CrawlProfile(final String name, final DigestURI startURL,
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
@@ -218,11 +221,6 @@ public boolean storeHTCache() {
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;
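
With the storeTXCache flag removed, the constructor takes one boolean fewer; the parameter order visible in the hunk is name, startURL, mustmatch, mustnotmatch, depth, recrawlIfOlder, domMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, remoteIndexing, xsstopw, xdstopw, xpstopw, cacheStrategy. A hedged usage sketch for an smb crawl follows; the literal values, the domMaxPages sentinel, and the exception type are assumptions, and only the parameter order, the MATCH_* constants, and CacheStrategy.IFFRESH come from the diff:

    // Hedged usage sketch against the new CrawlProfile constructor; "sb" is the
    // Switchboard as used in Crawler_p.java.
    try {
        final DigestURI startURL = new DigestURI("smb://fileserver/share/", null);
        final CrawlProfile profile = new CrawlProfile(
                startURL.getHost(),                      // name
                startURL,                                // startURL
                "smb://.*",                              // mustmatch: stay on smb paths
                CrawlProfile.MATCH_NEVER,                // mustnotmatch
                3,                                       // depth
                System.currentTimeMillis() - 86400000L,  // recrawlIfOlder: one day (assumed value)
                -1,                                      // domMaxPages (assumed "no limit" sentinel)
                false,                                   // crawlingQ
                true,                                    // indexText
                true,                                    // indexMedia
                false,                                   // storeHTCache: off for smb, per this commit
                false,                                   // remoteIndexing
                false, false, false,                     // xsstopw, xdstopw, xpstopw
                CrawlProfile.CacheStrategy.IFFRESH);     // cacheStrategy
        sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
    } catch (final MalformedURLException e) {
        // the start URL could not be parsed
    }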
