Skip to content

Commit

Permalink
BUGFIX for URLs like "/../" ...;
Browse files Browse the repository at this point in the history
new port handling;

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1271 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
borg-0300 committed Dec 30, 2005
1 parent 9cce3c5 commit b95c5d5
Showing 1 changed file with 111 additions and 34 deletions.
145 changes: 111 additions & 34 deletions source/de/anomic/plasma/plasmaHTCache.java
Expand Up @@ -65,6 +65,9 @@ the class shall also be used to do a cache-cleaning and index creation
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
Expand Down Expand Up @@ -419,20 +422,10 @@ public static boolean noIndexingURL(String urlString) {
return plasmaParser.mediaExtContains(urlString);
}

/**
* this method creates from a given host and path a cache path
* from a given host (which may also be an IPv4 - number, but not IPv6 or
* a domain; all without leading 'http://') and a path (which must start
* with a leading '/', and may also end in an '/') a path to a file
* in the file system with root as given in cachePath is constructed
* it will also be ensured, that the complete path exists; if necessary
* that path will be generated
* @return new File
*/
public File getCachePath(URL url) {
/* public File getCachePath(URL url) {
// this.log.logFinest("plasmaHTCache: getCachePath: IN=" + url.toString());
String remotePath = url.getFile();
if (!remotePath.startsWith("/")) { remotePath = "/" + remotePath; }
if (!remotePath.startsWith("/")) { remotePath = "/" + remotePath; }
if (remotePath.endsWith("/")) { remotePath = remotePath + "ndx"; }
remotePath = remotePath.replaceAll("[?&:]", "_"); // yes this is not reversible, but that is not needed
int port = url.getPort();
Expand All @@ -446,21 +439,9 @@ public File getCachePath(URL url) {
} else {
return new File(this.cachePath, url.getHost() + "!" + port + remotePath);
}
/* File path;
if (port == 80) {
path = new File(this.cachePath, url.getHost() + remotePath);
} else {
path = new File(this.cachePath, url.getHost() + "!" + port + remotePath);
}
this.log.logFinest("plasmaHTCache: getCachePath: OUT=" + path.toString());
return path;*/
}
} */

/**
* this is the reverse function to getCachePath: it constructs the url as string
* from a given storage path
*/
public static URL getURL(File cachePath, File f) {
/* public static URL getURL(File cachePath, File f) {
// this.log.logFinest("plasmaHTCache: getURL: IN: Path=[" + cachePath + "]");
// this.log.logFinest("plasmaHTCache: getURL: IN: File=[" + f + "]");
String s = f.toString().replace('\\', '/');
Expand All @@ -472,12 +453,6 @@ public static URL getURL(File cachePath, File f) {
s = s.substring(pos + c.length());
while (s.startsWith("/")) s = s.substring(1);
// dieser Block kann spaeter geloescht werden
pos = s.indexOf("+");
if (pos >= 0) {
s = s.substring(0, pos) + ":" + s.substring(pos + 1);
}

pos = s.indexOf("!");
if (pos >= 0) {
String temp = s.substring(pos + 1);
Expand All @@ -498,6 +473,107 @@ public static URL getURL(File cachePath, File f) {
}
}
return null;
}*/

/**
 * Maps a URL to the file below the cache root in which its content is stored.
 * <p>
 * The resulting location is
 * <code>cachePath/&lt;protocol&gt;/&lt;host&gt;[!&lt;port&gt;]&lt;path&gt;</code>:
 * <ul>
 * <li>a path ending in '/' gets the file name "ndx" appended;</li>
 * <li>every "/../" segment is stored as "/!!/";</li>
 * <li>the characters '?', '&amp;' and ':' in the path are replaced by '_'
 *     (this is not reversible);</li>
 * <li>the port is appended to the host, separated by '!', unless the URL has
 *     no explicit port or uses the default port of http (80), https (443)
 *     or ftp (21).</li>
 * </ul>
 * This method only computes the location; it does not create any
 * directories or files itself.
 *
 * @param url the URL of the resource
 * @return the cache file belonging to the URL
 */
public File getCachePath(URL url) {
    String localPath = url.getFile();
    if (!localPath.startsWith("/")) { localPath = "/" + localPath; }
    if (localPath.endsWith("/")) { localPath += "ndx"; }

    // one replaceAll pass skips overlapping matches ("/../../"),
    // so repeat until no "/../" is left
    final Matcher parentSegments = Pattern.compile("/\\.\\./").matcher(localPath);
    while (parentSegments.find()) {
        localPath = parentSegments.replaceAll("/!!/");
        parentSegments.reset(localPath);
    }

    // yes this is not reversible, but that is not needed
    localPath = localPath.replaceAll("[?&:]", "_");

    // only non-default ports become part of the directory name
    final String protocol = url.getProtocol();
    final int port = url.getPort();
    final boolean defaultPort =
            (port == 80  && protocol.equalsIgnoreCase("http"))  ||
            (port == 443 && protocol.equalsIgnoreCase("https")) ||
            (port == 21  && protocol.equalsIgnoreCase("ftp"));

    String hostDir = url.getHost();
    if (port >= 0 && !defaultPort) {
        hostDir = hostDir + "!" + port;
    }
    return new File(this.cachePath, protocol + "/" + hostDir + localPath);
}

/**
 * Reverse function to getCachePath: reconstructs the URL that belongs to a
 * file stored below the cache root.
 * <p>
 * The expected layout of <code>f</code> relative to <code>cachePath</code> is
 * <code>&lt;protocol&gt;/&lt;host&gt;[!&lt;port&gt;]&lt;path&gt;</code> with
 * protocol being one of "http", "https" or "ftp". A trailing "/ndx" is
 * turned back into a trailing "/", every "/!!/" segment back into "/../",
 * and a '!' inside the host part back into the ':' port separator.
 *
 * @param cachePath the root directory of the cache
 * @param f         a file below <code>cachePath</code>
 * @return the reconstructed URL, or <code>null</code> if <code>f</code> does
 *         not lie below <code>cachePath</code>, does not start with a known
 *         protocol directory, or does not yield a well-formed URL
 */
public static URL getURL(File cachePath, File f) {
    final String c = cachePath.toString().replace('\\', '/');
    String s = f.toString().replace('\\', '/');

    // the file must lie below the cache root; a prefix test keeps working
    // when the name of the root occurs again deeper inside the path
    if (!s.startsWith(c)) { return null; }
    s = s.substring(c.length());
    while (s.startsWith("/")) { s = s.substring(1); }

    final String protocol;
    if (s.startsWith("http/")) {
        protocol = "http://";
        s = s.substring(5);
    } else if (s.startsWith("https/")) {
        protocol = "https://";
        s = s.substring(6);
    } else if (s.startsWith("ftp/")) {
        protocol = "ftp://";
        s = s.substring(4);
    } else {
        return null;
    }

    // getCachePath appends "ndx" only directly after a '/', so only a whole
    // "ndx" path element is removed (a name like "file.ndx" stays intact)
    if (s.endsWith("/ndx")) { s = s.substring(0, s.length() - 3); }

    // one replaceAll pass skips overlapping matches ("/!!/!!/"),
    // so repeat until no "/!!/" is left
    final Matcher matcher = Pattern.compile("/!!/").matcher(s);
    while (matcher.find()) {
        s = matcher.replaceAll("/../");
        matcher.reset(s);
    }

    // the port marker '!' is only meaningful inside the host part;
    // a '!' further down in the path belongs to the original URL
    final int hostEnd = s.indexOf('/');
    final int pos = s.indexOf('!');
    if (pos >= 0 && (hostEnd < 0 || pos < hostEnd)) {
        s = s.substring(0, pos) + ":" + s.substring(pos + 1);
    }

    try {
        return new URL(protocol + s);
    } catch (MalformedURLException e) {
        // not a valid URL: report this the same way as any other unmappable file
        return null;
    }
}

public byte[] loadResource(URL url) {
Expand Down Expand Up @@ -563,8 +639,9 @@ public Entry(Date initDate, int depth, URL url, String name,
plasmaCrawlProfile.entry profile) {

// normalize url
serverLog.logFine("PLASMA", "Entry: URL=" + url.toString());
// serverLog.logFine("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);

try {
this.url = new URL(this.nomalizedURLString);
} catch (MalformedURLException e) {
Expand Down Expand Up @@ -651,7 +728,7 @@ public String shallStoreCacheForProxy() {

// check status code
if (!(this.responseStatus.startsWith("200") ||
this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }

// check storage location
// sometimes a file name is equal to a path name in the same directory;
Expand Down

0 comments on commit b95c5d5

Please sign in to comment.