Skip to content

Commit

Permalink
*) Extending hydrox urlDbCleanup function
Browse files Browse the repository at this point in the history
   - now the function tries to correct the URL first
   - if the url can not be corrected it will be deleted
   See: http://www.yacy-forum.de/viewtopic.php?p=13898

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1197 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Dec 9, 2005
1 parent e7d16ef commit 5a627a6
Showing 1 changed file with 45 additions and 2 deletions.
47 changes: 45 additions & 2 deletions source/yacy.java
Expand Up @@ -67,6 +67,7 @@
import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.http.httpc.response;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
Expand Down Expand Up @@ -1195,8 +1196,50 @@ private static void urldbcleanup(String homePath) {
String urlHash;
while (eiter2.hasNext()) {
urlHash = (String) eiter2.next();
currentUrlDB.remove(urlHash);
System.out.println("Removed UrlDB-Entry for urlHash: " + urlHash);

// trying to fix the invalid URL
httpc theHttpc = null;
String oldUrlStr = null;
try {
// getting the url data as byte array
byte[][] entry = currentUrlDB.urlHashCache.get(urlHash.getBytes());

// getting the wrong url string
oldUrlStr = new String(entry[1]).trim();

int pos = -1;
if ((pos = oldUrlStr.indexOf("://"))!= -1) {
// trying to correct the url
String newUrlStr = "http://" + oldUrlStr.substring(pos+3);
URL newUrl = new URL(newUrlStr);

// doing a http head request to test if the url is correct
theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getPort(), 30000, false);
response res = theHttpc.HEAD(newUrl.getPath(), null);

if (res.statusCode == 200) {
entry[1] = newUrl.toString().getBytes();
currentUrlDB.urlHashCache.put(entry);
System.out.println("UrlDB-Entry with urlHash '" + urlHash +
"' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash +
"' removed\n\tURL: " + oldUrlStr +
"\n\tConnection Status: " + res.status);
}
}
} catch (Exception e) {
currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash +
"' removed\n\tURL: " + oldUrlStr +
"\n\tExecption: " + e.getMessage());
} finally {
if (theHttpc != null) try {
theHttpc.close();
httpc.returnInstance(theHttpc);
} catch (Exception e) {}
}
}
}
plasmaCrawlLURL.damagedURLS.clear();
Expand Down

0 comments on commit 5a627a6

Please sign in to comment.