Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1300 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
borg-0300 committed Jan 9, 2006
1 parent c3284c2 commit 3abd843
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -160,18 +160,21 @@ public static String urlNormalform(URL url) {
} else if (url.getProtocol().equals("https")) {
if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; }
}
String path = url.getFile();
String path = url.getPath();

// (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.)
if (path.length() == 0 || path.charAt(0) != '/') path = "/" + path;

Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)[.]{2}(?=/)|/\\.(?=/)|/(?=/)");
Matcher matcher = pathPattern.matcher(path);
while (matcher.find()) {
path = matcher.replaceAll("");
matcher.reset(path);
}

String query = url.getQuery().replaceAll("[\"\\/:*?<>|]", "_");
if (query != null) { path = path.concat("_").concat(query); }

if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path;
return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + path;
}
Expand Down

0 comments on commit 3abd843

Please sign in to comment.