Skip to content

Commit

Permalink
*) crawler/ftp/CrawlWorker.java: better error handling
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2503 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Sep 7, 2006
1 parent 7d7f301 commit b445142
Showing 1 changed file with 137 additions and 126 deletions.
263 changes: 137 additions & 126 deletions source/de/anomic/plasma/crawler/ftp/CrawlWorker.java
Expand Up @@ -133,144 +133,155 @@ public Entry load() throws IOException {
String fullPath = this.url.getPath();
int port = this.url.getPort();

// open a connection to the ftp server
if (port == -1) {
ftpClient.exec("open " + host, false);
} else {
ftpClient.exec("open " + host + " " + port, false);
}
if (berr.size() > 0) {
this.log.logInfo("Unable to connect to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_CONNECTION_ERROR);
}

// login to the server
ftpClient.exec("user " + userName + " " + userPwd, false);
if (berr.size() > 0) {
this.log.logInfo("Unable to login to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_LOGIN_FAILED);
}

// change transfer mode to binary
ftpClient.exec("binary", false);
if (berr.size() > 0) {
this.log.logInfo("Unable to set the file transfer mode to binary for URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_TRASFER_MODE_PROBLEM);
}

// determine filename and path
String file, path;
if (fullPath.endsWith("/")) {
file = "";
path = fullPath;
} else {
int pos = fullPath.lastIndexOf("/");
if (pos == -1) {
file = fullPath;
path = "/";
} else {
path = fullPath.substring(0,pos+1);
file = fullPath.substring(pos+1);
plasmaHTCache.Entry htCache = null;
try {
// open a connection to the ftp server
if (port == -1) {
ftpClient.exec("open " + host, false);
} else {
ftpClient.exec("open " + host + " " + port, false);
}
}

// testing if the specified file is a directory
if (file.length() > 0) {
ftpClient.exec("cd \"" + path + "\"", false);

// testing if the current name is a directory
boolean isFolder = ftpClient.isFolder(file);
if (isFolder) {
fullPath = fullPath + "/";
if (berr.size() > 0) {
this.log.logWarning("Unable to connect to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_CONNECTION_ERROR);
return null;
}

// login to the server
ftpClient.exec("user " + userName + " " + userPwd, false);
if (berr.size() > 0) {
this.log.logWarning("Unable to login to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_LOGIN_FAILED);
return null;
}

// change transfer mode to binary
ftpClient.exec("binary", false);
if (berr.size() > 0) {
this.log.logWarning("Unable to set the file transfer mode to binary for URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_TRASFER_MODE_PROBLEM);
return null;
}

// determine filename and path
String file, path;
if (fullPath.endsWith("/")) {
file = "";
this.url = new URL(this.url,fullPath);
path = fullPath;
} else {
int pos = fullPath.lastIndexOf("/");
if (pos == -1) {
file = fullPath;
path = "/";
} else {
path = fullPath.substring(0,pos+1);
file = fullPath.substring(pos+1);
}
}

// testing if the specified file is a directory
if (file.length() > 0) {
ftpClient.exec("cd \"" + path + "\"", false);

// testing if the current name is a directory
boolean isFolder = ftpClient.isFolder(file);
if (isFolder) {
fullPath = fullPath + "/";
file = "";
this.url = new URL(this.url,fullPath);
}
}
}

// creating a cache file object
File cacheFile = this.cacheManager.getCachePath(this.url);

// TODO: aborting download if content is too long ...

// TODO: invalid file path check

// testing if the file already exists
if (cacheFile.isFile()) {
// delete the file if it already exists
this.cacheManager.deleteFile(this.url);
} else {
// create parent directories
cacheFile.getParentFile().mkdirs();
}

String mimeType;
Date fileDate;
plasmaHTCache.Entry htCache = null;
if (file.length() == 0) {
// getting the dirlist
mimeType = "text/html";
fileDate = new Date();

// create a htcache entry
htCache = createCacheEntry(mimeType,fileDate);

// generate the dirlist
StringBuffer dirList = ftpClient.dirhtml(fullPath);

// write it into a file
PrintWriter writer = new PrintWriter(new FileOutputStream(cacheFile),false);
writer.write(dirList.toString());
writer.flush();
writer.close();
} else {
// determine the mimetype of the resource
String extension = plasmaParser.getFileExt(this.url);
mimeType = plasmaParser.getMimeTypeByFileExt(extension);

// if the mimetype and file extension is supported we start to download the file
if ((this.acceptAllContent) || (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,this.url,mimeType))) {

// TODO: determine the real file date
// creating a cache file object
File cacheFile = this.cacheManager.getCachePath(this.url);

// TODO: aborting download if content is too long ...

// TODO: invalid file path check

// testing if the file already exists
if (cacheFile.isFile()) {
// delete the file if it already exists
this.cacheManager.deleteFile(this.url);
} else {
// create parent directories
cacheFile.getParentFile().mkdirs();
}

String mimeType;
Date fileDate;
if (file.length() == 0) {
// getting the dirlist
mimeType = "text/html";
fileDate = new Date();

// create a htcache entry
htCache = createCacheEntry(mimeType,fileDate);

// change into working directory
ftpClient.exec("cd \"" + fullPath + "\"", false);

// download the remote file
ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false);
// generate the dirlist
StringBuffer dirList = ftpClient.dirhtml(fullPath);

if (dirList != null && dirList.length() > 0) try {
// write it into a file
PrintWriter writer = new PrintWriter(new FileOutputStream(cacheFile),false);
writer.write(dirList.toString());
writer.flush();
writer.close();
} catch (Exception e) {
this.log.logInfo("Unable to write dirlist for URL " + this.url.toString());
htCache = null;
}
} else {
// determine the mimetype of the resource
String extension = plasmaParser.getFileExt(this.url);
mimeType = plasmaParser.getMimeTypeByFileExt(extension);

// if the mimetype and file extension is supported we start to download the file
if ((this.acceptAllContent) || (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,this.url,mimeType))) {

// TODO: determine the real file date
fileDate = new Date();

// create a htcache entry
htCache = createCacheEntry(mimeType,fileDate);

// change into working directory
ftpClient.exec("cd \"" + fullPath + "\"", false);

// download the remote file
ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false);
} else {
// if the response has not the right file type then reject file
this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + this.url.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
return null;
}
}

// pass the downloaded resource to the cache manager
if (berr.size() > 0 || htCache == null) {
// if the response has not the right file type then reject file
this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + this.url.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
this.log.logWarning("Unable to download URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_DOWNLOAD_ERROR);

// an error has occurred. Clean up.
if (cacheFile.exists()) cacheFile.delete();
} else {
// announce the file
this.cacheManager.writeFileAnnouncement(cacheFile);

// enQueue new entry with response header
if (this.profile != null) {
this.cacheManager.push(htCache);
}
}
}

// closing connection
ftpClient.exec("close", false);
ftpClient.exec("exit", false);

// pass the downloaded resource to the cache manager
if (berr.size() > 0 || htCache == null) {
// if the response has not the right file type then reject file
this.log.logInfo("Unable to download URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_DOWNLOAD_ERROR);

// an error has occurred. Clean up.
if (cacheFile.exists()) cacheFile.delete();
} else {
// announce the file
this.cacheManager.writeFileAnnouncement(cacheFile);

// enQueue new entry with response header
if (this.profile != null) {
this.cacheManager.push(htCache);
}
}

return htCache;
return htCache;
} finally {
// closing connection
ftpClient.exec("close", false);
ftpClient.exec("exit", false);
}
}


Expand Down

0 comments on commit b445142

Please sign in to comment.