Commit f3baaca

- enhancements to DNS IP caching and crawler speed
- bugfixes (NPEs)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7619 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Mar 22, 2011
1 parent e7860b1 commit f3baaca
Showing 8 changed files with 66 additions and 86 deletions.
4 changes: 2 additions & 2 deletions htroot/IndexControlRWIs_p.html
@@ -41,13 +41,13 @@ <h2>Reverse Word Index Administration</h2>
<dl>
<dt class="TableCellDark">Index Deletion</dt>
<dd><input type="checkbox" name="deleteIndex" id="deleteIndex"
onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteCache').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteCache').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteIndex">Delete Search Index</label><br/>
<input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /><label for="deleteCrawlQueues">Stop Crawler and delete Crawl Queues</label><br/>
<input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /><label for="deleteCache">Delete HTTP &amp; FTP Cache</label><br/>
<input type="checkbox" name="deleteRobots" id="deleteRobots" disabled="disabled" /><label for="deleteRobots">Delete robots.txt Cache</label><br/>
<input type="checkbox" name="deleteSearchFl" id="deleteSearchFl" disabled="disabled" /><label for="deleteSearchFl">Delete cached snippet-fetching failures during search</label><br/><br/><br/>
<input type="submit" name="deletecomplete" value="Delete"/>
<input type="submit" name="deletecomplete" id="deletecomplete" value="Delete" disabled="disabled"/>
</dd>
</dl>
</fieldset>
40 changes: 11 additions & 29 deletions htroot/IndexImportWikimedia_p.java
@@ -23,11 +23,9 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.io.File;
-import java.net.MalformedURLException;

import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.MediawikiImporter;
-import net.yacy.kelondro.logging.Log;

import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@@ -57,33 +55,17 @@ public static serverObjects respond(final RequestHeader header, final serverObje
} else {
if (post.containsKey("file")) {
final File sourcefile = new File(post.get("file"));
-//final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
-/*
-if (!name.endsWith("pages-articles.xml.bz2")) {
-prop.put("import", 0);
-prop.put("import_status", 1);
-prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
-return prop;
-}
-*/
-try {
-MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
-MediawikiImporter.job.start();
-prop.put("import", 1);
-prop.put("import_thread", "started");
-prop.put("import_dump", MediawikiImporter.job.source());
-prop.put("import_count", 0);
-prop.put("import_speed", 0);
-prop.put("import_runningHours", 0);
-prop.put("import_runningMinutes", 0);
-prop.put("import_remainingHours", 0);
-prop.put("import_remainingMinutes", 0);
-} catch (MalformedURLException e) {
-Log.logException(e);
-prop.put("import", 0);
-prop.put("import_status", 1);
-prop.put("import_status_message", e.getMessage());
-}
+MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
+MediawikiImporter.job.start();
+prop.put("import", 1);
+prop.put("import_thread", "started");
+prop.put("import_dump", MediawikiImporter.job.source());
+prop.put("import_count", 0);
+prop.put("import_speed", 0);
+prop.put("import_runningHours", 0);
+prop.put("import_runningMinutes", 0);
+prop.put("import_remainingHours", 0);
+prop.put("import_remainingMinutes", 0);
}
return prop;
}
2 changes: 1 addition & 1 deletion htroot/PerformanceMemory_p.java
@@ -196,7 +196,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// other caching structures
prop.putNum("namecacheHit.size", Domains.nameCacheHitSize());
prop.putNum("namecacheMiss.size", Domains.nameCacheMissSize());
prop.putNum("namecache.noCache", Domains.nameCacheNoCachingListSize());
prop.putNum("namecache.noCache", 0);
prop.putNum("blacklistcache.size", Switchboard.urlBlacklist.blacklistCacheSize());
prop.putNum("searchevent.size", SearchEventCache.size());
prop.putNum("searchevent.hit", SearchEventCache.cacheHit);
4 changes: 2 additions & 2 deletions source/de/anomic/crawler/Balancer.java
@@ -453,13 +453,13 @@ public Request pop(final boolean delay, final CrawlSwitchboard cs) throws IOExce
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
-if (loops < 2) {
+if (loops < 3) {
rest = rest + 1000 * loops;
loops = 0;
}
if (rest > 0) {try {this.wait(rest); } catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + ((loops - i) * 3) + " seconds remaining...");
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
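
The Balancer hunk above also fixes the countdown log: each loop waits exactly one second, so the old message overstated the remaining time by a factor of three. Below is a minimal sketch of the wait-splitting idea, with a plain lock object and System.out standing in for the Balancer's monitor and Log.logInfo (hypothetical names, not the actual API):

    public final class CrawlDelaySketch {
        private final Object lock = new Object();

        // Split a long crawl delay into one-second waits so progress can be
        // logged each second; delays under three seconds collapse into one wait.
        public void delay(final String host, final long sleeptime) throws InterruptedException {
            long loops = sleeptime / 1000;  // whole seconds to wait
            long rest = sleeptime % 1000;   // leftover milliseconds
            if (loops < 3) {
                rest = rest + 1000 * loops;
                loops = 0;
            }
            synchronized (this.lock) {
                if (rest > 0) this.lock.wait(rest);
                for (int i = 0; i < loops; i++) {
                    // each pass waits one second, so (loops - i) is accurate
                    System.out.println("waiting for " + host + ": " + (loops - i) + " seconds remaining...");
                    this.lock.wait(1000);
                }
            }
        }
    }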
88 changes: 42 additions & 46 deletions source/net/yacy/cora/protocol/Domains.java
@@ -51,15 +51,14 @@ public class Domains {
private static final String LOCAL_PATTERNS = "10\\..*,127\\..*,172\\.(1[6-9]|2[0-9]|3[0-1])\\..*,169\\.254\\..*,192\\.168\\..*,localhost";
private static final int MAX_NAME_CACHE_HIT_SIZE = 20000;
private static final int MAX_NAME_CACHE_MISS_SIZE = 20000;
-private static final int MAX_NAME_NO_CACHING_LIST_SIZE = 20000;
private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() + 1;

// a dns cache
private static final ARC<String, InetAddress> NAME_CACHE_HIT = new ConcurrentARC<String, InetAddress>(MAX_NAME_CACHE_HIT_SIZE, CONCURRENCY_LEVEL);
private static final ARC<String, String> NAME_CACHE_MISS = new ConcurrentARC<String, String>(MAX_NAME_CACHE_MISS_SIZE, CONCURRENCY_LEVEL);
-private static final ARC<String, String> NAME_CACHE_NO_CACHING_LIST = new ConcurrentARC<String, String>(MAX_NAME_NO_CACHING_LIST_SIZE, CONCURRENCY_LEVEL);
-public static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>());
-public static final List<Pattern> LOCALHOST_PATTERNS = makePatterns(LOCAL_PATTERNS);
+private static final ConcurrentHashMap<String, Object> LOOKUP_SYNC = new ConcurrentHashMap<String, Object>();
+private static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>());
+private static final List<Pattern> LOCALHOST_PATTERNS = makePatterns(LOCAL_PATTERNS);

/**
* ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! !
@@ -496,46 +495,59 @@ public static InetAddress dnsResolve(String host) {

// try to resolve host by doing a name cache lookup
ip = NAME_CACHE_HIT.get(host);
-if (ip != null) return ip;
-
-if (NAME_CACHE_MISS.containsKey(host)) return null;
+if (ip != null) {
+//System.out.println("DNSLOOKUP-CACHE-HIT(CONC) " + host);
+return ip;
+}
+if (NAME_CACHE_MISS.containsKey(host)) {
+//System.out.println("DNSLOOKUP-CACHE-MISS(CONC) " + host);
+return null;
+}

// call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out
-try {
-boolean doCaching = true;
-ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
-if ((ip == null) ||
-(ip.isLoopbackAddress()) ||
-(NAME_CACHE_NO_CACHING_LIST.containsKey(host))
-) {
-doCaching = false;
-} else {
-if (matchesList(host, nameCacheNoCachingPatterns)) {
-NAME_CACHE_NO_CACHING_LIST.put(host, PRESENT);
-doCaching = false;
-}
+final Object sync_obj_new = new Object();
+Object sync_obj = LOOKUP_SYNC.putIfAbsent(host, sync_obj_new);
+if (sync_obj == null) sync_obj = sync_obj_new;
+synchronized (sync_obj) {
+// now look again if the host is in the cache where it may be meanwhile because of the synchronization
+ip = NAME_CACHE_HIT.get(host);
+if (ip != null) {
+//System.out.println("DNSLOOKUP-CACHE-HIT(SYNC) " + host);
+return ip;
+}
+if (NAME_CACHE_MISS.containsKey(host)) {
+//System.out.println("DNSLOOKUP-CACHE-MISS(SYNC) " + host);
+return null;
+}

-if (doCaching && ip != null) {

+// do the dns lookup on the dns server
+//if (!matchesList(host, nameCacheNoCachingPatterns)) System.out.println("DNSLOOKUP " + host);
+try {
+ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
+} catch (final UnknownHostException e) {
+// add new entries
+NAME_CACHE_MISS.put(host, PRESENT);
+LOOKUP_SYNC.remove(host);
+return null;
+}

+if ((ip != null) &&
+(!ip.isLoopbackAddress()) &&
+(!matchesList(host, nameCacheNoCachingPatterns))
+) {
// add new entries
NAME_CACHE_HIT.put(host, ip);
}
+LOOKUP_SYNC.remove(host);
return ip;
-} catch (final UnknownHostException e) {
-// remove old entries
-flushMissNameCache();
-
-// add new entries
-NAME_CACHE_MISS.put(host, PRESENT);
-}
-return null;
}
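
The rewritten dnsResolve drops the no-caching list and instead serializes lookups per host: ConcurrentHashMap.putIfAbsent gives all concurrent callers for the same hostname one shared monitor, and the hit/miss caches are re-checked inside the synchronized block, so each hostname triggers at most one network lookup at a time. A minimal sketch of that pattern, assuming a plain ConcurrentHashMap cache where the class actually uses its ARC caches:

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.concurrent.ConcurrentHashMap;

    public final class DnsResolveSketch {
        private static final ConcurrentHashMap<String, InetAddress> NAME_CACHE_HIT = new ConcurrentHashMap<String, InetAddress>();
        private static final ConcurrentHashMap<String, Object> LOOKUP_SYNC = new ConcurrentHashMap<String, Object>();

        public static InetAddress resolve(final String host) {
            InetAddress ip = NAME_CACHE_HIT.get(host);
            if (ip != null) return ip;                      // fast path, no locking

            final Object sync_obj_new = new Object();
            Object sync_obj = LOOKUP_SYNC.putIfAbsent(host, sync_obj_new);
            if (sync_obj == null) sync_obj = sync_obj_new;  // first caller for this host
            synchronized (sync_obj) {
                // re-check: a concurrent caller may have resolved the host meanwhile
                ip = NAME_CACHE_HIT.get(host);
                if (ip != null) return ip;
                try {
                    ip = InetAddress.getByName(host);       // the single network lookup
                    NAME_CACHE_HIT.put(host, ip);
                    return ip;
                } catch (final UnknownHostException e) {
                    return null;                            // the real code also records the miss
                } finally {
                    LOOKUP_SYNC.remove(host);               // let the next resolve start fresh
                }
            }
        }
    }

Removing the monitor in the finally block keeps LOOKUP_SYNC from growing without bound; a thread that arrives after removal simply creates a new monitor and finds the answer in the cache on its re-check.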

private final static Pattern dotPattern = Pattern.compile("\\.");

-private static final InetAddress parseInetAddress(final String ip) {
+private static final InetAddress parseInetAddress(String ip) {
if (ip == null || ip.length() < 8) return null;
+if (ip.equals("0:0:0:0:0:0:0:1%0")) ip = "127.0.0.1";
final String[] ips = dotPattern.split(ip);
if (ips.length != 4) return null;
final byte[] ipb = new byte[4];
@@ -567,22 +579,6 @@ public static int nameCacheMissSize() {
return NAME_CACHE_MISS.size();
}

-/**
-* Returns the number of entries in the nameCacheNoCachingList list
-*
-* @return int The number of entries in the nameCacheNoCachingList list
-*/
-public static int nameCacheNoCachingListSize() {
-return NAME_CACHE_NO_CACHING_LIST.size();
-}
-
-/**
-* Removes old entries from the dns miss cache
-*/
-public static void flushMissNameCache() {
-if (NAME_CACHE_MISS.size() > MAX_NAME_CACHE_MISS_SIZE) NAME_CACHE_MISS.clear();
-}

private static String localHostName = "127.0.0.1";
private static Set<InetAddress> localHostAddresses = new HashSet<InetAddress>();
private static Set<String> localHostNames = new HashSet<String>();
5 changes: 4 additions & 1 deletion source/net/yacy/document/Condenser.java
@@ -201,9 +201,12 @@ public Condenser(
// images
final Iterator<ImageEntry> j = document.getImages().values().iterator();
ImageEntry ientry;
+MultiProtocolURI url;
while (j.hasNext()) {
ientry = j.next();
-insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
+url = ientry.url();
+if (url == null) continue;
+insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
}
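
This hunk is one of the NPE bugfixes named in the commit message: ientry.url() can return null, so the URL is fetched into a local variable and checked before toNormalform is called. The Document.java hunk below guards the same way. A self-contained illustration of the pattern, with java.net.URI standing in for MultiProtocolURI and a hypothetical Entry class for ImageEntry:

    import java.net.URI;
    import java.util.Arrays;
    import java.util.List;

    public final class NullGuardSketch {
        // hypothetical stand-in for ImageEntry: url() may legitimately be null
        static final class Entry {
            private final URI url;
            Entry(final URI url) { this.url = url; }
            URI url() { return this.url; }
        }

        public static void main(final String[] args) {
            final List<Entry> images = Arrays.asList(
                    new Entry(URI.create("http://example.net/logo.png")), new Entry(null));
            for (final Entry ientry : images) {
                final URI url = ientry.url();  // fetch once into a local
                if (url == null) continue;     // guard before any dereference
                System.out.println(url.toASCIIString());
            }
        }
    }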

5 changes: 3 additions & 2 deletions source/net/yacy/document/Document.java
@@ -481,7 +481,7 @@ public static Map<MultiProtocolURI, String> allReflinks(final Collection<?> link
final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Iterator<?> i = links.iterator();
Object o;
-MultiProtocolURI url;
+MultiProtocolURI url = null;
String u;
int pos;
loop: while (i.hasNext())
@@ -495,8 +495,9 @@ else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url();
else {
assert false;
-continue;
+continue loop;
}
+if (url == null) continue loop;
u = url.toNormalform(true, true);
if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
i.remove();
4 changes: 1 addition & 3 deletions source/net/yacy/document/importer/MediawikiImporter.java
@@ -90,7 +90,7 @@ public class MediawikiImporter extends Thread implements Importer {
private String hostport, urlStub;


-public MediawikiImporter(File sourcefile, File targetdir) throws MalformedURLException {
+public MediawikiImporter(File sourcefile, File targetdir) {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
@@ -762,8 +762,6 @@ public static void main(String[] s) {
mi.join();
} catch (InterruptedException e) {
Log.logException(e);
-} catch (IOException e) {
-Log.logException(e);
}
}

