Skip to content

Commit

Permalink
joined anomic.net.URL, plasmaURL and url hash computation:
Browse files Browse the repository at this point in the history
search profiling showed that a major amount of time is wasted by computing url hashes. The computation does an intranet-check, which needs a DNS lookup. Because of this, each urlhash computation needed 100-200 milliseconds, which caused remote searches to be delayed at least 1 second more than necessary. The solution to this problem is to attach a URL hash to the URL data structure, because that means that the url hash value can be filled in after retrieval of the URL from the database. The redesign of the url/urlhash management caused a major redesign of many parts of the software. Some parts that had already been slated for abandonment were removed during this change to avoid unnecessary maintenance of unused code.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4074 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Sep 5, 2007
1 parent 66905b7 commit daf0f74
Show file tree
Hide file tree
Showing 160 changed files with 1,481 additions and 10,006 deletions.
39 changes: 3 additions & 36 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@

<!-- compiling the main sources -->
<javac srcdir="${src}/" destdir="${build}"
excludes="de/anomic/plasma/parser/*/*,de/anomic/yacy/seedUpload/**,de/anomic/soap/**,yacy.java,de/anomic/server/portForwarding/*/*,de/anomic/data/rssReader.java"
excludes="de/anomic/plasma/parser/*/*,de/anomic/yacy/seedUpload/**,yacy.java,de/anomic/server/portForwarding/*/*,de/anomic/data/rssReader.java"
debug="true" debuglevel="lines,vars,source"
source="${javacSource}" target="${javacTarget}">
<classpath refid="project.class.path"/>
Expand All @@ -241,7 +241,6 @@
<javac srcdir="${htroot}/"
classpathref="project.class.path"
debug="true" debuglevel="lines,vars,source"
excludes="soap/*"
source="${javacSource}" target="${javacTarget}"/>
</target>

Expand Down Expand Up @@ -297,33 +296,6 @@
<fileset dir="${src}/" includes="de/anomic/yacy/seedUpload/yacySeedUpload*.xml"/>
</subant>
</target>

<!-- compiling optional soap API and building install packages -->
<target name="compileSoap" depends="compileMain" description="Compiling and zipping additional yacy SOAP API">
<subant target="${extensionTarget}">
<property name="src" location="${src}"/>
<property name="build" location="${build}"/>
<property name="libx" location="${libx}"/>
<property name="htroot" value="${htroot}"/>
<property name="release" location="${release_ext}"/>
<property name="javacSource" value="${javacSource}"/>
<property name="javacTarget" value="${javacTarget}"/>
<fileset dir="${src}/" includes="de/anomic/soap/build.xml"/>
</subant>
</target>

<target name="distSoapClientStubJar" depends="init" description="Generates a jar file with all client stub classes for the YaCy SOAP API">
<subant target="buildClientStubJar">
<property name="src" location="${src}"/>
<property name="build" location="${build}"/>
<property name="libx" location="${libx}"/>
<property name="htroot" value="${htroot}"/>
<property name="release" location="${release_ext}"/>
<property name="javacSource" value="${javacSource}"/>
<property name="javacTarget" value="${javacTarget}"/>
<fileset dir="${src}/" includes="de/anomic/soap/build.xml"/>
</subant>
</target>

<target name="compilePortForwarding" depends="compileMain" description="Compiling and zipping additional port forwarder">
<javac srcdir="${src}/de/anomic/server/portForwarding" destdir="${build}" source="${javacSource}" target="${javacTarget}" debug="true" debuglevel="lines,vars,source">
Expand Down Expand Up @@ -359,7 +331,7 @@
</target>

<!-- compile optional classs that were not compiled elsewhere -->
<target name="compileExtensions" depends="compileMain,compileParsers,compileSeedUploaders,compileSoap,compilePortForwarding">
<target name="compileExtensions" depends="compileMain,compileParsers,compileSeedUploaders,compilePortForwarding">

<!-- compile rss Reader -->
<javac srcdir="${src}" destdir="${build}"
Expand Down Expand Up @@ -423,9 +395,7 @@
<!-- excluding all additional content parsers -->
<exclude name="de/anomic/plasma/parser/*/*"/>
<!-- excluding all additional seed uploaders -->
<exclude name="de/anomic/yacy/seedUpload/**"/>
<!-- excluding the soap handler -->
<exclude name="de/anomic/soap/**"/>
<exclude name="de/anomic/yacy/seedUpload/**"/>
<!-- excluding the port forwarder -->
<exclude name="de/anomic/server/portForwarding/*/*"/>
<!-- excluding rss Reader class -->
Expand Down Expand Up @@ -527,8 +497,6 @@
<exclude name="de/anomic/plasma/parser/*/*"/>
<!-- excluding sources for additional seed uploaders -->
<exclude name="de/anomic/yacy/seedUpload/yacySeedUpload**"/>
<!-- excluding soap -->
<exclude name="de/anomic/soap/**"/>
<!-- excluding the port forwarder -->
<exclude name="de/anomic/server/portForwarding/*/*"/>
<!-- excluding rss Reader class -->
Expand All @@ -545,7 +513,6 @@
<fileset dir="${htroot}">
<include name="**/*"/>
<exclude name="yacy/seedUpload/**"/>
<exclude name="soap/**"/>
</fileset>
<fileset dir="${htroot}">
<include name="yacy/seedUpload/yacySeedUploadFile.html"/>
Expand Down
6 changes: 3 additions & 3 deletions htroot/Blacklist_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,14 @@

import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.abstractURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class Blacklist_p {
private final static String DISABLED = "disabled_";
Expand Down Expand Up @@ -95,9 +95,9 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("testlist",1);
String urlstring = post.get("testurl", "");
if(!urlstring.startsWith("http://")) urlstring = "http://"+urlstring;
URL testurl = null;
yacyURL testurl = null;
try {
testurl = new URL(urlstring);
testurl = new yacyURL(urlstring, null);
} catch (MalformedURLException e) { }
if(testurl != null) {
prop.put("testlist_url",testurl.toString());
Expand Down
4 changes: 2 additions & 2 deletions htroot/Bookmarks.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
Expand All @@ -69,6 +68,7 @@
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
import de.anomic.yacy.yacyURL;

public class Bookmarks {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
Expand Down Expand Up @@ -214,7 +214,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}
try {
File file=new File((String)post.get("bookmarksfile"));
switchboard.bookmarksDB.importFromBookmarks(new URL(file) , new String((byte[])post.get("bookmarksfile$file")), tags, isPublic);
switchboard.bookmarksDB.importFromBookmarks(new yacyURL(file) , new String((byte[])post.get("bookmarksfile$file")), tags, isPublic);
} catch (MalformedURLException e) {}

}else if(post.containsKey("xmlfile")){
Expand Down
4 changes: 2 additions & 2 deletions htroot/CacheAdmin_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
Expand All @@ -71,6 +70,7 @@
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyURL;

public class CacheAdmin_p {

Expand Down Expand Up @@ -118,7 +118,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
final StringBuffer tree = new StringBuffer();
final StringBuffer info = new StringBuffer();

final URL url = plasmaHTCache.getURL(file);
final yacyURL url = plasmaHTCache.getURL(file);

String urlstr = "";

Expand Down
4 changes: 2 additions & 2 deletions htroot/ConfigLanguage_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@
import de.anomic.data.translator;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyURL;


public class ConfigLanguage_p {
Expand Down Expand Up @@ -97,7 +97,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
String url = (String)post.get("url");
ArrayList langVector;
try{
URL u = new URL(url);
yacyURL u = new yacyURL(url, null);
langVector = nxTools.strings(httpc.wget(u, u.getHost(), 6000, null, null, switchboard.remoteProxyConfig, null, null), "UTF-8");
}catch(IOException e){
prop.put("status", 1);//unable to get url
Expand Down
4 changes: 2 additions & 2 deletions htroot/ConfigSkins_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@
import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyURL;

public class ConfigSkins_p {

Expand Down Expand Up @@ -126,7 +126,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
String url = (String)post.get("url");
ArrayList skinVector;
try{
URL u = new URL(url);
yacyURL u = new yacyURL(url, null);
skinVector = nxTools.strings(httpc.wget(u, u.getHost(), 6000, null, null, switchboard.remoteProxyConfig, null, null), "UTF-8");
}catch(IOException e){
prop.put("status", 1);//unable to get URL
Expand Down
4 changes: 2 additions & 2 deletions htroot/ConfigUpdate_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@
import java.util.TreeSet;

import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverSystem;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.yacyVersion;

public class ConfigUpdate_p {
Expand All @@ -54,7 +54,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
String release = post.get("releasedownload", "");
if (release.length() > 0) {
try {
yacyVersion.downloadRelease(new yacyVersion(new URL(release)));
yacyVersion.downloadRelease(new yacyVersion(new yacyURL(release, null)));
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
Expand Down
4 changes: 2 additions & 2 deletions htroot/CrawlResults.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@

import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
Expand All @@ -39,6 +38,7 @@
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class CrawlResults {

Expand Down Expand Up @@ -170,7 +170,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve

urlstr = comp.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = plasmaHTCache.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(plasmaHTCache.cachePath.toString().length() + 1);
cachepath = plasmaHTCache.getCachePath(new yacyURL(urlstr, null)).toString().replace('\\', '/').substring(plasmaHTCache.cachePath.toString().length() + 1);

prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
if (showControl) {
Expand Down
2 changes: 1 addition & 1 deletion htroot/CrawlStartSimple_p.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ <h2>Easy Crawl Start</h2>
<td><label for="crawlingDepth">Crawling Range</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Wide: depth <input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;|&nbsp;&nbsp;
<input type="radio" name="range" value="domain" />Complete Single Domain
<input type="radio" name="range" value="domain" />Complete Domain
</td>
<td>
The range defines if the crawl shall consider a complete domain, or a wide crawl up to a specific depth.
Expand Down
6 changes: 3 additions & 3 deletions htroot/CrawlStartSimple_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@
import java.util.Iterator;

import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class CrawlStartSimple_p {

Expand Down Expand Up @@ -152,8 +152,8 @@ record = (yacyNewsRecord) recordIterator.next();
if ((yacyCore.seedDB == null) || (yacyCore.seedDB.mySeed.isVirgin()) || (yacyCore.seedDB.mySeed.isJunior())) {
prop.put("remoteCrawlPeers", 0);
} else {
Enumeration crawlavail = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(plasmaURL.dummyHash, true);
Enumeration crawlpendi = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(plasmaURL.dummyHash, false);
Enumeration crawlavail = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(yacyURL.dummyHash, true);
Enumeration crawlpendi = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(yacyURL.dummyHash, false);
if ((!(crawlavail.hasMoreElements())) && (!(crawlpendi.hasMoreElements()))) {
prop.put("remoteCrawlPeers", 0); //no peers availible
} else {
Expand Down
12 changes: 6 additions & 6 deletions htroot/CrawlURLFetchStack_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
Expand All @@ -64,6 +63,7 @@
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;

public class CrawlURLFetchStack_p {

Expand Down Expand Up @@ -200,16 +200,16 @@ else if (post.containsKey("subupload")) {
prop.put("upload", 1);
} else if (type.equals("html")) {
try {
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL(file));
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL(file));
final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
serverFileUtils.write(content, writer);
writer.close();

final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
int added = 0, failed = 0;
URL url;
yacyURL url;
while (it.hasNext()) try {
url = new URL((String)it.next());
url = new yacyURL((String) it.next(), null);
if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) {
failed++;
continue;
Expand Down Expand Up @@ -264,7 +264,7 @@ private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack
private static boolean addURL(String url, boolean blCheck, URLFetcherStack stack) {
try {
if (url == null || url.length() == 0) return false;
URL u = new URL(url);
yacyURL u = new yacyURL(url, null);
if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, u)) return false;
stack.push(u);
return true;
Expand All @@ -288,7 +288,7 @@ private static int addURLs(serverObjects post, int amount, URLFetcherStack stack
url = post.get("url" + i, null);
if (url == null || url.length() == 0) continue;
try {
stack.push(new URL(url));
stack.push(new yacyURL(url, null));
count++;
} catch (MalformedURLException e) {
serverLog.logInfo("URLFETCHER", "retrieved invalid url for adding to the stack: " + url);
Expand Down
Loading

0 comments on commit daf0f74

Please sign in to comment.