Skip to content
Permalink
Browse files

Fixed crawl queue folder naming for IPv6 hosts on MS Windows filesystems

As reported by @vikulin in issue #187, crawling websites using a raw
IPv6 address as host name in their URL failed when running on Microsoft
Windows platforms (FAT32 or NTFS filesystems) when YaCy crawler created
the crawl queue folder, as the ':' character which is part of an IPV6
address is forbidden on these filesystems.
  • Loading branch information...
luccioman committed Aug 11, 2018
1 parent 0e976e9 commit 21ad9435ec6bccaea87e91f4d6111d47b4458a26
Showing with 31 additions and 3 deletions.
  1. +31 −3 source/net/yacy/crawler/HostQueue.java
@@ -20,10 +20,16 @@

package net.yacy.crawler;

import static net.yacy.kelondro.util.FileUtils.deletedelete;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
@@ -51,7 +57,6 @@
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.table.Table;
import static net.yacy.kelondro.util.FileUtils.deletedelete;
import net.yacy.kelondro.util.kelondroException;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@@ -93,7 +98,19 @@ public HostQueue (
this.port = hostUrl.getPort();
this.hostHash = hostUrl.hosthash(); // hosthash is calculated by protocol + hostname + port
// hostName/port included just for human readability (& historically), "-#" marker used to define begin of hosthash in directoryname
this.hostPath = new File(hostsPath, this.hostName + "-#"+ this.hostHash + "." + this.port);
if(this.hostName.startsWith("[") && this.hostName.endsWith("]") && this.hostName.contains(":")) {
/* Percent-encode the host name when it is an IPV6 address, as the ':' character is illegal in a file name on MS Windows FAT32 and NTFS file systems */
File path;
try {
path = new File(hostsPath, URLEncoder.encode(this.hostName, StandardCharsets.UTF_8.name()) + "-#"+ this.hostHash + "." + this.port);
} catch (final UnsupportedEncodingException e) {
/* This should not happen has UTF-8 encoding support is required for any JVM implementation */
path = new File(hostsPath, this.hostName + "-#"+ this.hostHash + "." + this.port);
}
this.hostPath = path;
} else {
this.hostPath = new File(hostsPath, this.hostName + "-#"+ this.hostHash + "." + this.port);
}
init();
}

@@ -120,7 +137,18 @@ public HostQueue (
this.port = Integer.parseInt(filename.substring(pdot + 1)); // consider "host.com" contains dot but no required port -> will throw exception
int p1 = filename.lastIndexOf("-#");
if (p1 >= 0) {
this.hostName = filename.substring(0,p1);
String hostNameInFile = filename.substring(0,p1);
if(hostNameInFile.startsWith("%5B") && hostNameInFile.endsWith("%5D") && hostNameInFile.contains("%3A")) {
/* Host name is a percent-encoded IPV6 address */
try {
hostNameInFile = URLDecoder.decode(hostNameInFile, StandardCharsets.UTF_8.name());
} catch (final UnsupportedEncodingException | RuntimeException ignored) {
/* This should not happen has UTF-8 encoding support is required for any JVM implementation */
}
this.hostName = hostNameInFile;
} else {
this.hostName = hostNameInFile;
}
this.hostHash = filename.substring(p1+2,pdot);
} else throw new RuntimeException("hostPath name must contain -# followd by hosthash: " + filename);
init();

0 comments on commit 21ad943

Please sign in to comment.
You can’t perform that action at this time.