From 2802138787145753653286990f7f3c923630ab93 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 15 Dec 2008 00:02:58 +0000 Subject: [PATCH] - refactoring of CrawlStacker (to prepare it for new multi-Threading to remove DNS lookup bottleneck) - fix of shallBeOwnWord target computation heuristic git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5392 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigAccounts_p.java | 2 +- htroot/IndexTransfer_p.java | 6 +- htroot/rct_p.java | 2 +- htroot/yacy/crawlReceipt.java | 2 +- htroot/yacy/transferURL.java | 2 +- source/de/anomic/crawler/CrawlQueues.java | 2 +- source/de/anomic/crawler/CrawlStacker.java | 337 +++++++----------- source/de/anomic/plasma/plasmaSearchAPI.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 51 +-- source/de/anomic/yacy/yacyClient.java | 2 +- source/de/anomic/yacy/yacyPeerSelection.java | 36 +- source/de/anomic/yacy/yacySeed.java | 40 ++- 12 files changed, 208 insertions(+), 276 deletions(-) diff --git a/htroot/ConfigAccounts_p.java b/htroot/ConfigAccounts_p.java index 64d5b155df..35af08fb7c 100644 --- a/htroot/ConfigAccounts_p.java +++ b/htroot/ConfigAccounts_p.java @@ -62,7 +62,7 @@ public static serverObjects respond(final httpRequestHeader header, final server } if (localhostAccess) { - if (sb.acceptLocalURLs) { + if (sb.crawlStacker.acceptLocalURLs()) { // in this case it is not allowed to use a localhostAccess option prop.put("commitIntranetWarning", 1); localhostAccess = false; diff --git a/htroot/IndexTransfer_p.java b/htroot/IndexTransfer_p.java index 57b6c4e3a5..7f88fd5702 100644 --- a/htroot/IndexTransfer_p.java +++ b/htroot/IndexTransfer_p.java @@ -105,14 +105,12 @@ public static serverObjects respond(final httpRequestHeader header, final server } else { if (!prop.containsKey("running_status")) prop.put("running_status","Not running"); } - - - + //List known hosts yacySeed seed; int hc = 0; if ((sb.webIndex.seedDB != null) && (sb.webIndex.seedDB.sizeConnected() > 0)) { - 
final Iterator e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, "AAAAAAAAAAAA", sb.webIndex.seedDB.sizeConnected()); + final Iterator e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, null, sb.webIndex.seedDB.sizeConnected(), false); final TreeMap hostList = new TreeMap(); while (e.hasNext()) { seed = e.next(); diff --git a/htroot/rct_p.java b/htroot/rct_p.java index e056ec725e..cb2e2942dc 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -72,7 +72,7 @@ public static serverObjects respond(final httpRequestHeader header, final server loaddate = new Date(); } final yacyURL referrer = null; // referrer needed! - final String urlRejectReason = sb.acceptURL(url); + final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url); if (urlRejectReason == null) { // stack url if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 8d7997d33e..489d38b3e7 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -127,7 +127,7 @@ public static serverObjects respond(final httpRequestHeader header, final server } // check if the entry is in our network domain - final String urlRejectReason = sb.acceptURL(comp.url()); + final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(comp.url()); if (urlRejectReason != null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "9999"); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index b7d1efc23e..c2a2f84769 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -128,7 +128,7 @@ public static serverObjects respond(final httpRequestHeader header, final server } // check if the entry is in our network domain - final String urlRejectReason = 
sb.acceptURL(comp.url()); + final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(comp.url()); if (urlRejectReason != null) { if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: blocked URL '" + comp.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName); lEntry = null; diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 2be78d90f1..9e2e0621e7 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -393,7 +393,7 @@ public boolean remoteCrawlLoaderJob() { } catch (final ParseException e) { loaddate = new Date(); } - final String urlRejectReason = sb.acceptURL(url); + final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url); if (urlRejectReason == null) { // stack url if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 660ed7fe70..4271b42413 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -28,151 +28,73 @@ package de.anomic.crawler; -import java.io.File; import java.io.IOException; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Date; -import java.util.Iterator; import java.util.LinkedList; import de.anomic.index.indexReferenceBlacklist; import de.anomic.index.indexURLReference; -import de.anomic.kelondro.kelondroCache; -import de.anomic.kelondro.kelondroEcoTable; -import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; -import de.anomic.kelondro.kelondroTree; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverDomains; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyURL; -public final 
class CrawlStacker extends Thread { - - private static final int EcoFSBufferSize = 20; - private static String stackfile = "urlNoticeStacker9.db"; - - // keys for different database types - public static final int QUEUE_DB_TYPE_RAM = 0; - public static final int QUEUE_DB_TYPE_TREE = 1; - public static final int QUEUE_DB_TYPE_ECO = 2; +public final class CrawlStacker { final serverLog log = new serverLog("STACKCRAWL"); - private final plasmaSwitchboard sb; - private final LinkedList urlEntryHashCache; - private kelondroIndex urlEntryCache; - private final File cacheStacksPath; - private final int dbtype; - private final boolean prequeue; - private long dnsHit, dnsMiss; - private int alternateCount; - + private final LinkedList urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first + private kelondroIndex urlEntryCache; // the entries in the queue + private long dnsHit, dnsMiss; + private int alternateCount; + private CrawlQueues nextQueue; + private plasmaWordIndex wordIndex; + private boolean acceptLocalURLs, acceptGlobalURLs; // objects for the prefetch task private final ArrayList dnsfetchHosts = new ArrayList(); - public CrawlStacker(final plasmaSwitchboard sb, final File dbPath, final int dbtype, final boolean prequeue) { - this.sb = sb; - this.prequeue = prequeue; + + // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt + + public CrawlStacker(CrawlQueues cq, plasmaWordIndex wordIndex, boolean acceptLocalURLs, boolean acceptGlobalURLs) { + this.nextQueue = cq; + this.wordIndex = wordIndex; this.dnsHit = 0; this.dnsMiss = 0; this.alternateCount = 0; + this.acceptLocalURLs = acceptLocalURLs; + this.acceptGlobalURLs = acceptGlobalURLs; // init the message list this.urlEntryHashCache = new LinkedList(); - - // create a stack for newly entered entries - this.cacheStacksPath = dbPath; - this.dbtype = dbtype; - - openDB(); - try { - // loop through the list and fill 
the messageList with url hashs - final Iterator rows = this.urlEntryCache.rows(true, null); - kelondroRow.Entry entry; - while (rows.hasNext()) { - entry = rows.next(); - if (entry == null) { - System.out.println("ERROR! null element found"); - continue; - } - this.urlEntryHashCache.add(entry.getColString(0, null)); - } - } catch (final kelondroException e) { - /* if we have an error, we start with a fresh database */ - CrawlStacker.this.log.logSevere("Unable to initialize crawl stacker queue, kelondroException:" + e.getMessage() + ". Reseting DB.\n", e); - - // deleting old db and creating a new db - try {this.urlEntryCache.close();} catch (final Exception ex) {} - deleteDB(); - openDB(); - } catch (final IOException e) { - /* if we have an error, we start with a fresh database */ - CrawlStacker.this.log.logSevere("Unable to initialize crawl stacker queue, IOException:" + e.getMessage() + ". Reseting DB.\n", e); - // deleting old db and creating a new db - try {this.urlEntryCache.close();} catch (final Exception ex) {} - deleteDB(); - openDB(); - } - this.log.logInfo(size() + " entries in the stackCrawl queue."); - this.start(); // start the prefetcher thread + this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0); this.log.logInfo("STACKCRAWL thread initialized."); } - public void run() { - String nextHost; - try { - while (!Thread.currentThread().isInterrupted()) { // action loop - if (dnsfetchHosts.size() == 0) synchronized (this) { wait(); } - synchronized (dnsfetchHosts) { - nextHost = dnsfetchHosts.remove(dnsfetchHosts.size() - 1); - } - try { - serverDomains.dnsResolve(nextHost); - } catch (final Exception e) {} - } - } catch (final InterruptedException e) {} - } - - public boolean prefetchHost(final String host) { - // returns true when the host was known in the dns cache. 
- // If not, the host is stacked on the fetch stack and false is returned - try { - serverDomains.dnsResolveFromCache(host); - return true; - } catch (final UnknownHostException e) { - synchronized (this) { - dnsfetchHosts.add(host); - notifyAll(); - } - return false; - } - } - - public void terminateDNSPrefetcher() { - synchronized (this) { - interrupt(); + public int size() { + synchronized (this.urlEntryHashCache) { + return this.urlEntryHashCache.size(); } } - + public void clear() throws IOException { this.urlEntryHashCache.clear(); this.urlEntryCache.clear(); } public void close() { - if (this.dbtype == QUEUE_DB_TYPE_RAM) { - this.log.logInfo("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait."); - while (size() > 0) { - if (!job()) break; - } + this.log.logInfo("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait."); + while (size() > 0) { + if (!job()) break; } - terminateDNSPrefetcher(); this.log.logInfo("Shutdown. Closing stackCrawl queue."); @@ -182,26 +104,68 @@ public void close() { // clearing the hash list this.urlEntryHashCache.clear(); } + + private boolean prefetchHost(final String host) { + // returns true when the host was known in the dns cache. 
+ // If not, the host is stacked on the fetch stack and false is returned + try { + serverDomains.dnsResolveFromCache(host); + return true; + } catch (final UnknownHostException e) { + synchronized (this) { + dnsfetchHosts.add(host); + notifyAll(); + } + return false; + } + } public boolean job() { - CrawlEntry entry; + // this is the method that is called by the busy thread from outside + if (this.urlEntryHashCache.size() == 0) return false; + + // get the next entry from the queue + String urlHash = null; + kelondroRow.Entry ec = null; + synchronized (this.urlEntryHashCache) { + urlHash = this.urlEntryHashCache.removeFirst(); + if (urlHash == null) { + urlEntryHashCache.clear(); + try { + urlEntryCache.clear(); + } catch (IOException e) { + e.printStackTrace(); + } + return false; + } + try { + ec = this.urlEntryCache.remove(urlHash.getBytes()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + } + if (urlHash == null || ec == null) return false; + + // make a crawl Entry out of it + CrawlEntry entry = null; try { - entry = dequeueEntry(); - } catch (final IOException e) { - e.printStackTrace(); + entry = new CrawlEntry(ec); + } catch (IOException e1) { + e1.printStackTrace(); return false; } + if (entry == null) return false; try { - - final String rejectReason = sb.crawlStacker.stackCrawl(entry); + final String rejectReason = stackCrawl(entry); // if the url was rejected we store it into the error URL db if (rejectReason != null) { - final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, rejectReason); + final ZURL.Entry ee = nextQueue.errorURL.newEntry(entry, wordIndex.seedDB.mySeed().hash, new Date(), 1, rejectReason); ee.store(); - sb.crawlQueues.errorURL.push(ee); + nextQueue.errorURL.push(ee); } } catch (final Exception e) { CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e); @@ -270,8 
+234,6 @@ public void enqueueEntry( synchronized(this.urlEntryHashCache) { kelondroRow.Entry oldValue; - boolean hostknown = true; - if (prequeue) hostknown = prefetchHost(nexturl.getHost()); try { oldValue = this.urlEntryCache.put(newEntryRow); } catch (final IOException e) { @@ -279,7 +241,7 @@ public void enqueueEntry( } if (oldValue == null) { //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : "")); - if (hostknown) { + if (prefetchHost(nexturl.getHost())) { this.alternateCount++; this.urlEntryHashCache.addFirst(newEntry.url().hash()); this.dnsHit++; @@ -297,79 +259,9 @@ public void enqueueEntry( } } - private void deleteDB() { - if (this.dbtype == QUEUE_DB_TYPE_RAM) { - // do nothing.. - return; - } - if (this.dbtype == QUEUE_DB_TYPE_ECO) { - new File(cacheStacksPath, stackfile).delete(); - //kelondroFlexWidthArray.delete(cacheStacksPath, stackfile); - } - if (this.dbtype == QUEUE_DB_TYPE_TREE) { - final File cacheFile = new File(cacheStacksPath, stackfile); - cacheFile.delete(); - } - } - - private void openDB() { - if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path - - if (this.dbtype == QUEUE_DB_TYPE_RAM) { - this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0); - } - if (this.dbtype == QUEUE_DB_TYPE_ECO) { - cacheStacksPath.mkdirs(); - final File f = new File(cacheStacksPath, stackfile); - try { - this.urlEntryCache = new kelondroEcoTable(f, CrawlEntry.rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0); - //this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, CrawlEntry.rowdef, 0, true)); - } catch (final Exception e) { - e.printStackTrace(); - // kill DB and try again - f.delete(); - //kelondroFlexTable.delete(cacheStacksPath, newCacheName); - try { - this.urlEntryCache = new kelondroEcoTable(f, 
CrawlEntry.rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0); - //this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, CrawlEntry.rowdef, 0, true)); - } catch (final Exception ee) { - ee.printStackTrace(); - System.exit(-1); - } - } - } - if (this.dbtype == QUEUE_DB_TYPE_TREE) { - final File cacheFile = new File(cacheStacksPath, stackfile); - cacheFile.getParentFile().mkdirs(); - this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, 0, CrawlEntry.rowdef)); - } - } - - public int size() { - synchronized (this.urlEntryHashCache) { - return this.urlEntryHashCache.size(); - } - } - - public int getDBType() { - return this.dbtype; - } - - public CrawlEntry dequeueEntry() throws IOException { - if (this.urlEntryHashCache.size() == 0) return null; - String urlHash = null; - kelondroRow.Entry entry = null; - synchronized (this.urlEntryHashCache) { - urlHash = this.urlEntryHashCache.removeFirst(); - if (urlHash == null) throw new IOException("urlHash is null"); - entry = this.urlEntryCache.remove(urlHash.getBytes()); - } - - if ((urlHash == null) || (entry == null)) return null; - return new CrawlEntry(entry); - } - public String stackCrawl(final CrawlEntry entry) { + + private String stackCrawl(final CrawlEntry entry) { // stacks a crawl item. The position can also be remote // returns null if successful, a reason string if not successful //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); @@ -379,7 +271,7 @@ public String stackCrawl(final CrawlEntry entry) { // check if the protocol is supported final String urlProtocol = entry.url().getProtocol(); - if (!sb.crawlQueues.isSupportedProtocol(urlProtocol)) { + if (!nextQueue.isSupportedProtocol(urlProtocol)) { reason = "unsupported protocol"; this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. 
" + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); @@ -387,9 +279,9 @@ public String stackCrawl(final CrawlEntry entry) { } // check if ip is local ip address - final String urlRejectReason = sb.acceptURL(entry.url()); + final String urlRejectReason = urlInAcceptedDomain(entry.url()); if (urlRejectReason != null) { - reason = "denied_(" + urlRejectReason + ")_domain=" + sb.getConfig("network.unit.domain", "unknown"); + reason = "denied_(" + urlRejectReason + ")"; if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } @@ -402,7 +294,7 @@ public String stackCrawl(final CrawlEntry entry) { return reason; } - final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle()); + final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle()); if (profile == null) { final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); log.logWarning(errorMsg); @@ -443,7 +335,7 @@ public String stackCrawl(final CrawlEntry entry) { return reason; } - final yacyURL referrerURL = (entry.referrerhash() == null) ? null : sb.crawlQueues.getURL(entry.referrerhash()); + final yacyURL referrerURL = (entry.referrerhash() == null) ? 
null : nextQueue.getURL(entry.referrerhash()); // add domain to profile domain list if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { @@ -467,8 +359,8 @@ public String stackCrawl(final CrawlEntry entry) { } // check if the url is double registered - final String dbocc = sb.crawlQueues.urlExists(entry.url().hash()); - final indexURLReference oldEntry = this.sb.webIndex.getURL(entry.url().hash(), null, 0); + final String dbocc = nextQueue.urlExists(entry.url().hash()); + final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0); final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); // do double-check if ((dbocc != null) && (!recrawl)) { @@ -489,16 +381,16 @@ public String stackCrawl(final CrawlEntry entry) { } // store information - final boolean local = entry.initiator().equals(sb.webIndex.seedDB.mySeed().hash); - final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(this.sb.webIndex.defaultProxyProfile.handle()); - final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle()); + final boolean local = entry.initiator().equals(wordIndex.seedDB.mySeed().hash); + final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(wordIndex.defaultProxyProfile.handle()); + final boolean remote = profile.handle().equals(wordIndex.defaultRemoteProfile.handle()); final boolean global = (profile.remoteIndexing()) /* granted */ && (entry.depth() == profile.depth()) /* leaf node */ && //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ && ( - (sb.webIndex.seedDB.mySeed().isSenior()) || - (sb.webIndex.seedDB.mySeed().isPrincipal()) + (wordIndex.seedDB.mySeed().isSenior()) || + (wordIndex.seedDB.mySeed().isPrincipal()) ) /* qualified */; if (!local && !global && !remote && !proxy) { 
@@ -508,23 +400,62 @@ public String stackCrawl(final CrawlEntry entry) { // it may be possible that global == true and local == true, so do not check an error case against it if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); - sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); } if (local) { if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); - sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); } if (proxy) { if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); - sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); } if (remote) { - sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); } - } + return null; } + + /** + * Test a url if it can be used for crawling/indexing + * This mainly checks if the url is in the declared domain (local/global) + * 
@param url + * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted + */ + public String urlInAcceptedDomain(final yacyURL url) { + // returns true if the url can be accepted accoring to network.unit.domain + if (url == null) return "url is null"; + final String host = url.getHost(); + if (host == null) return "url.host is null"; + if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve + /* + InetAddress hostAddress = serverDomains.dnsResolve(host); + // if we don't know the host, we cannot load that resource anyway. + // But in case we use a proxy, it is possible that we dont have a DNS service. + final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig(); + if (hostAddress == null) { + if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved"; + } + */ + // check if this is a local address and we are allowed to index local pages: + //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); + final boolean local = url.isLocal(); + //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! + if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; + return (local) ? 
+ ("the host '" + host + "' is local, but local addresses are not accepted") : + ("the host '" + host + "' is global, but global addresses are not accepted"); + } + + public boolean acceptLocalURLs() { + return this.acceptLocalURLs; + } + + public boolean acceptGlobalURLs() { + return this.acceptGlobalURLs; + } } diff --git a/source/de/anomic/plasma/plasmaSearchAPI.java b/source/de/anomic/plasma/plasmaSearchAPI.java index 5634bd07da..274e1a2478 100644 --- a/source/de/anomic/plasma/plasmaSearchAPI.java +++ b/source/de/anomic/plasma/plasmaSearchAPI.java @@ -73,7 +73,7 @@ public static void listHosts(final serverObjects prop, final String startHash, f yacySeed seed; int hc = 0; prop.put("searchresult_keyhash", startHash); - final Iterator e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, startHash, sb.webIndex.seedDB.sizeConnected()); + final Iterator e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, startHash, sb.webIndex.seedDB.sizeConnected(), true); while (e.hasNext()) { seed = e.next(); if (seed != null) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 04fe2da698..4fea492a40 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -234,7 +234,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used - public boolean acceptLocalURLs, acceptGlobalURLs; public URLLicense licensedURLs; public Timer moreMemory; @@ -548,9 +547,11 @@ public plasmaSwitchboard(final File rootPath, final String initPath, final Strin this.observer.resourceObserverJob(); // initializing the stackCrawlThread - this.crawlStacker = new CrawlStacker(this, this.plasmaPath, (int) getConfigLong("tableTypeForPreNURL", 0), (((int) getConfigLong("tableTypeForPreNURL", 0) == 0) && 
(getConfigLong(plasmaSwitchboardConstants.CRAWLSTACK_BUSYSLEEP, 0) <= 100))); - //this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL); - //this.sbStackCrawlThread.start(); + this.crawlStacker = new CrawlStacker( + crawlQueues, + this.webIndex, + "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, + "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); // initializing dht chunk generation this.dhtTransferChunk = null; @@ -680,10 +681,6 @@ public void overwriteNetworkDefinition() { // initiate url license object licensedURLs = new URLLicense(8); - // set URL domain acceptance - acceptGlobalURLs = "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0; - acceptLocalURLs = "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0; - /* // in intranet and portal network set robinson mode if (networkUnitDefinition.equals("defaults/yacy.network.webportal.unit") || @@ -736,7 +733,7 @@ public void switchNetwork(final String networkDefinition) { // that an automatic authorization of localhost is done, because in this case crawls from local // addresses are blocked to prevent attack szenarios where remote pages contain links to localhost // addresses that can steer a YaCy peer - if ((this.acceptLocalURLs) && (getConfigBool("adminAccountForLocalhost", false))) { + if ((crawlStacker.acceptLocalURLs()) && (getConfigBool("adminAccountForLocalhost", false))) { setConfig("adminAccountForLocalhost", false); if (getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").startsWith("0000")) { // the password was set automatically with a random value. 
@@ -856,36 +853,6 @@ public boolean isInMyCluster(final yacySeed seed) { } } - /** - * Test a url if it can be used for crawling/indexing - * This mainly checks if the url is in the declared domain (local/global) - * @param url - * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted - */ - public String acceptURL(final yacyURL url) { - // returns true if the url can be accepted accoring to network.unit.domain - if (url == null) return "url is null"; - final String host = url.getHost(); - if (host == null) return "url.host is null"; - if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve - /* - InetAddress hostAddress = serverDomains.dnsResolve(host); - // if we don't know the host, we cannot load that resource anyway. - // But in case we use a proxy, it is possible that we dont have a DNS service. - final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig(); - if (hostAddress == null) { - if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved"; - } - */ - // check if this is a local address and we are allowed to index local pages: - //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); - final boolean local = url.isLocal(); - //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! - if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; - return (local) ? 
- ("the host '" + host + "' is local, but local addresses are not accepted") : - ("the host '" + host + "' is global, but global addresses are not accepted"); - } public String urlExists(final String hash) { // tests if hash occurrs in any database @@ -992,7 +959,7 @@ public boolean htEntryStoreProcess(final indexDocumentMetadata entry) { * * check if ip is local ip address // TODO: remove this procotol specific code here * ========================================================================= */ - final String urlRejectReason = acceptURL(entry.url()); + final String urlRejectReason = crawlStacker.urlInAcceptedDomain(entry.url()); if (urlRejectReason != null) { if (this.log.isFine()) this.log.logFine("Rejected URL '" + entry.url() + "': " + urlRejectReason); doIndexing = false; @@ -1298,7 +1265,7 @@ public boolean cleanupJob() { } // set a random password if no password is configured - if (!this.acceptLocalURLs && getConfigBool("adminAccountForLocalhost", false) && getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").length() == 0) { + if (!crawlStacker.acceptLocalURLs() && getConfigBool("adminAccountForLocalhost", false) && getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").length() == 0) { // make a 'random' password setConfig(httpd.ADMIN_ACCOUNT_B64MD5, "0000" + serverCodings.encodeMD5Hex(System.getProperties().toString() + System.currentTimeMillis())); setConfig("adminAccount", ""); @@ -1998,7 +1965,7 @@ public boolean dhtTransferProcess(final plasmaDHTChunk dhtChunk, final int peerC try { // find a list of DHT-peers if (log != null) log.logInfo("Collecting DHT target peers for first_hash = " + dhtChunk.firstContainer().getWordHash() + ", last_hash = " + dhtChunk.lastContainer().getWordHash()); - final Iterator seedIter = yacyPeerSelection.getAcceptRemoteIndexSeeds(webIndex.seedDB, dhtChunk.lastContainer().getWordHash(), peerCount + 9); + final Iterator seedIter = yacyPeerSelection.getAcceptRemoteIndexSeeds(webIndex.seedDB, dhtChunk.lastContainer().getWordHash(), peerCount + 9, 
false); // send away the indexes to all these peers int hc1 = 0; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index a57be65fc8..841f590132 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -548,7 +548,7 @@ public static String[] search( continue; // block with backlist } - final String urlRejectReason = plasmaSwitchboard.getSwitchboard().acceptURL(comp.url()); + final String urlRejectReason = plasmaSwitchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(comp.url()); if (urlRejectReason != null) { yacyCore.log.logInfo("remote search (client): rejected url '" + comp.url() + "' (" + urlRejectReason + ") from peer " + target.getName()); continue; // reject url outside of our domain diff --git a/source/de/anomic/yacy/yacyPeerSelection.java b/source/de/anomic/yacy/yacyPeerSelection.java index a679d17bb6..7efe954c55 100644 --- a/source/de/anomic/yacy/yacyPeerSelection.java +++ b/source/de/anomic/yacy/yacyPeerSelection.java @@ -28,6 +28,7 @@ import java.util.HashSet; import java.util.Iterator; +import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.serverDate; @@ -48,7 +49,7 @@ public static void selectDHTPositions(final yacySeedDB seedDB, String wordhash, long distance; for (int v = 0; v < dhtVerticalTargets.length; v++) { wordhash = yacySeed.positionToHash(dhtVerticalTargets[v]); - Iterator dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy); + Iterator dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy, false); int c = Math.min(seedDB.sizeConnected(), redundancy); int cc = 3; // select a maximum of 3, this is enough redundancy while (dhtEnum.hasNext() && c > 0 && cc-- > 0) { @@ -64,23 +65,24 @@ public static void selectDHTPositions(final yacySeedDB seedDB, String wordhash, } } - public static boolean verifyIfOwnWord(final yacySeedDB 
seedDB, final String wordhash, int redundancy) { + public static boolean verifyIfOwnWord(final yacySeedDB seedDB, String wordhash, int redundancy) { String myHash = seedDB.mySeed().hash; - long[] dhtVerticalTargets = yacySeed.dhtPositions(wordhash, yacySeed.partitionExponent); - for (int v = 0; v < dhtVerticalTargets.length; v++) { - Iterator dhtEnum = getAcceptRemoteIndexSeeds(seedDB, yacySeed.positionToHash(dhtVerticalTargets[v]), redundancy); + //long[] dhtVerticalTargets = yacySeed.dhtPositions(wordhash, yacySeed.partitionExponent); + //for (int v = 0; v < dhtVerticalTargets.length; v++) { + //wordhash = yacySeed.positionToHash(dhtVerticalTargets[0]); + Iterator dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy, true); while (dhtEnum.hasNext()) { - if (dhtEnum.next().equals(myHash)) return true; + if (dhtEnum.next().hash.equals(myHash)) return true; } - } + //} return false; } - public static Iterator getAcceptRemoteIndexSeeds(yacySeedDB seedDB, final String starthash, int max) { + public static Iterator getAcceptRemoteIndexSeeds(yacySeedDB seedDB, final String starthash, int max, boolean alsoMyOwn) { // returns an enumeration of yacySeed-Objects // that have the AcceptRemoteIndex-Flag set // the seeds are enumerated in the right order according DHT - return new acceptRemoteIndexSeedEnum(seedDB, starthash, Math.max(max, seedDB.sizeConnected())); + return new acceptRemoteIndexSeedEnum(seedDB, starthash, Math.min(max, seedDB.sizeConnected()), alsoMyOwn); } private static class acceptRemoteIndexSeedEnum implements Iterator { @@ -90,13 +92,15 @@ private static class acceptRemoteIndexSeedEnum implements Iterator { private yacySeedDB seedDB; private HashSet doublecheck; private int remaining; + private boolean alsoMyOwn; - public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final String starthash, int max) { + public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final String starthash, int max, boolean alsoMyOwn) { this.seedDB = seedDB; this.se = 
getDHTSeeds(seedDB, starthash, yacyVersion.YACY_HANDLES_COLLECTION_INDEX); this.remaining = max; this.doublecheck = new HashSet(); this.nextSeed = nextInternal(); + this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0); } public boolean hasNext() { @@ -127,9 +131,15 @@ private yacySeed nextInternal() { } public yacySeed next() { - final yacySeed next = nextSeed; - nextSeed = nextInternal(); - return next; + if (alsoMyOwn && kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) < 0) { + // take my own seed hash instead the enumeration result + alsoMyOwn = false; + return seedDB.mySeed(); + } else { + final yacySeed next = nextSeed; + nextSeed = nextInternal(); + return next; + } } public void remove() { diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java index e1e08c402a..8f818f4d4f 100644 --- a/source/de/anomic/yacy/yacySeed.java +++ b/source/de/anomic/yacy/yacySeed.java @@ -1017,19 +1017,45 @@ public final yacySeed clone() { private static int guessedOwn = 0; + //private static int guessedNotOwn = 0; private static int verifiedOwn = 0; + private static int verifiedNotOwn = 0; public static boolean shallBeOwnWord(final yacySeedDB seedDB, final String wordhash, int redundancy) { - if (!guessIfOwnWord(seedDB, wordhash)) return false; - guessedOwn++; - if (yacyPeerSelection.verifyIfOwnWord(seedDB, wordhash, redundancy)) { - verifiedOwn++; - System.out.println("*** DEBUG shallBeOwnWord: true. verified/guessed ration = " + verifiedOwn + "/" + guessedOwn); - return true; + // the guessIfOwnWord is a fast method that should only fail in case that a 'true' may be incorrect, but a 'false' shall always be correct + if (guessIfOwnWord(seedDB, wordhash)) { + // this case must be verified, because it can be wrong. 
+            guessedOwn++;
+            if (yacyPeerSelection.verifyIfOwnWord(seedDB, wordhash, redundancy)) {
+                // this is the correct case, but does not need to be an average case
+                verifiedOwn++;
+                //System.out.println("*** DEBUG shallBeOwnWord: true. guessed: true. verified/guessed ratio = " + verifiedOwn + "/" + guessedOwn);
+                return true;
+            } else {
+                // this may happen, but can be corrected
+                verifiedNotOwn++;
+                //System.out.println("*** DEBUG shallBeOwnWord: false. guessed: true. verified/guessed ratio = " + verifiedNotOwn + "/" + guessedNotOwn);
+                return false;
+            }
         } else {
-            System.out.println("*** DEBUG shallBeOwnWord: false. verified/guessed ration = " + verifiedOwn + "/" + guessedOwn);
             return false;
+            /*
+            // this should mean that the guessing should not be wrong
+            guessedNotOwn++;
+            if (yacyPeerSelection.verifyIfOwnWord(seedDB, wordhash, redundancy)) {
+                // this should never happen
+                verifiedOwn++;
+                System.out.println("*** DEBUG shallBeOwnWord: true. guessed: false. verified/guessed ratio = " + verifiedOwn + "/" + guessedOwn);
+                return true;
+            } else {
+                // this should always happen
+                verifiedNotOwn++;
+                //System.out.println("*** DEBUG shallBeOwnWord: false. guessed: false. verified/guessed ratio = " + verifiedNotOwn + "/" + guessedNotOwn);
+                return false;
+            }
+            */
         }
+        
     }
     
     private static boolean guessIfOwnWord(final yacySeedDB seedDB, final String wordhash) {