Skip to content
Permalink
Browse files

Fixed raw IPV6 addresses snapshots read/write on FAT32 and NTFS fs

Fixes issue #225
  • Loading branch information...
luccioman committed Sep 12, 2018
1 parent 1054822 commit 7adbd1f87df67cc2b2bbeb61977bc65ca73acc39
Showing with 77 additions and 15 deletions.
  1. +63 −13 source/net/yacy/crawler/data/Snapshots.java
  2. +14 −2 source/net/yacy/crawler/data/Transactions.java
@@ -25,6 +25,9 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
@@ -194,7 +197,7 @@ public Revisions getRevisions(String urlhash) {

/**
* list the snapshots for a given host name
* @param hostport the <host>.<port> identifier for the domain
* @param hostport the <host>.<port> identifier for the domain (with the same format as applied by the Snapshots.pathToHostPortDir() function)
* @param depth restrict the result to the given depth or if depth == -1 do not restrict to a depth
* @return a map with a set for each depth in the domain of the host name
*/
@@ -244,8 +247,7 @@ public int listIDsSize(final String hostport, final int depth) {
public File definePath(final DigestURL url, final int depth, final Date date, final String ext) {
// The resulting file is named <url hash>.<formatted date>.<ext> and is located in the directory returned by pathToShard(url, depth)
String id = ASCII.String(url.hash());
// The date part of the file name is rendered with the shared GenericFormatter.SHORT_MINUTE_FORMATTER
String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
// NOTE(review): this excerpt is a commit diff rendered without +/- markers. The next two lines look like the removed
// form (temporary variable) and the single return after them like its replacement; both build the same File - confirm against the repository
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
return path;
return new File(pathToShard(url, depth), id + "." + ds + "." + ext);
}

/**
@@ -268,7 +270,7 @@ public void announceStorage(final DigestURL url, final int depth, final Date dat

/**
* Delete information about the storage of a snapshot to the Snapshot-internal index.
* The actual deletion of files in the target directory must be done elsewehre, this method does not store the snapshot files.
* The actual deletion of files in the target directory must be done elsewhere, this method does not store the snapshot files.
* @param url
* @param depth
* @param date
@@ -335,18 +337,30 @@ public void announceStorage(final DigestURL url, final int depth, final Date dat
}
}
if (host != null && depth == null) {
String hostport = pathToHostPortDir(host,80);
String hostport = pathToHostPortDir(host, 80);
TreeMap<Integer, TreeSet<String>> depthIdsMap = this.directory.get(hostport);
if (depthIdsMap != null) loop: for (Map.Entry<Integer, TreeSet<String>> depthIds: depthIdsMap.entrySet()) {
for (String id: depthIds.getValue()) {
dateIdResult.put(id, new String[]{hostport, Integer.toString(depthIds.getKey())});
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
}
if(depthIdsMap == null && isIpv6AddrHost(host)) {
/* If the host is a raw IPV6 address, we check also if a snapshot was recorded with the old format (without percent-encoding) */
hostport = pathToHostPortDir(host, 80, false);
depthIdsMap = this.directory.get(hostport);
}
if (depthIdsMap != null) {
loop: for (Map.Entry<Integer, TreeSet<String>> depthIds: depthIdsMap.entrySet()) {
for (String id: depthIds.getValue()) {
dateIdResult.put(id, new String[]{hostport, Integer.toString(depthIds.getKey())});
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
}
}
}
}
if (host != null && depth != null) {
String hostport = pathToHostPortDir(host,80);
String hostport = pathToHostPortDir(host, 80);
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(hostport);
if(domaindepth == null && isIpv6AddrHost(host)) {
/* If the host is a raw IPV6 address, we check also if a snapshot was recorded with the old format (without percent-encoding) */
hostport = pathToHostPortDir(host, 80, false);
domaindepth = this.directory.get(hostport);
}
if (domaindepth != null) {
TreeSet<String> ids = domaindepth.get(depth);
if (ids != null) loop: for (String id: ids) {
@@ -430,6 +444,10 @@ public int getDepth(final DigestURL url, final Fulltext fulltext) {
public Collection<File> findPaths(final DigestURL url, final int depth, final String ext) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
if(!pathToShard.exists() && isIpv6AddrHost(url.getHost())) {
/* If the host is a raw IPV6 address, we check also if a snapshot was recorded with the old format (without percent-encoding) */
pathToShard = pathToShard(pathToHostPortDir(url.getHost(), url.getPort(), false), ASCII.String(url.hash()), depth);
}
String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
ArrayList<File> paths = new ArrayList<>();
if (list != null) {
@@ -450,9 +468,41 @@ private File pathToShard(final String hostport, final String urlhash, final int
File pathToShard = new File(pathToDepthDir, pathToShard(urlhash));
return pathToShard;
}


/**
 * Tell whether a host string has the shape of a raw IPV6 address literal,
 * that is a string enclosed in square brackets and containing at least one ':' character.
 * @param host a domain name or IP address, may be null
 * @return true when the host string is a raw IPV6 address (with square brackets), false otherwise (including when host is null)
 */
private boolean isIpv6AddrHost(final String host) {
    if (host == null) {
        return false;
    }
    /* A raw IPV6 host is always written between square brackets... */
    final boolean bracketed = host.startsWith("[") && host.endsWith("]");
    /* ...and holds at least one colon separator */
    return bracketed && host.contains(":");
}

/**
* Build the host and port directory name, formatted as the host followed by a dot and the port number.
* @param host a domain name or IP address
* @param port a port number
* @return a representation of the host and port encoding IPV6 addresses for better support across file systems (notably FAT or NTFS)
*/
private String pathToHostPortDir(final String host, final int port) {
// NOTE(review): this excerpt is a commit diff rendered without +/- markers. The first return looks like the removed line
// (plain concatenation) and the second like its replacement (delegation with encodeIpv6 == true) - confirm against the repository
return host + "." + port;
return pathToHostPortDir(host, port, true);
}

/**
 * Build the host and port directory name, formatted as the (optionally encoded) host followed by a dot and the port number.
 * @param host a domain name or IP address
 * @param port a port number
 * @param encodeIpv6 when true, percent-encode the host when it is a raw IPV6 address, for better support across file systems (notably FAT or NTFS)
 * @return a representation of the host and port
 */
private String pathToHostPortDir(final String host, final int port, final boolean encodeIpv6) {
    final String portSuffix = "." + port;
    if (!encodeIpv6 || !isIpv6AddrHost(host)) {
        /* Nothing to encode : the host is used as is */
        return host + portSuffix;
    }
    /* Percent-encode the raw IPV6 address, as the ':' character is illegal in a file name on MS Windows FAT32 and NTFS file systems */
    try {
        return URLEncoder.encode(host, StandardCharsets.UTF_8.name()) + portSuffix;
    } catch (final UnsupportedEncodingException ignored) {
        /* This should not happen as UTF-8 encoding support is required for any JVM implementation : fall back to the non encoded host */
        return host + portSuffix;
    }
}

private String pathToDepthDir(final int depth) {
@@ -35,6 +35,7 @@
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.solr.common.SolrInputDocument;
@@ -84,6 +85,15 @@ public static void init(File dir) {
archiveDir = new File(transactionDir, State.ARCHIVE.dirname);
archive = new Snapshots(archiveDir);
}

/**
 * Request a shutdown of the executor and wait at most 10 seconds for its termination.
 * When the waiting thread is interrupted, its interrupt status is restored before returning,
 * so that callers can still detect the interruption.
 * NOTE(review): despite the method name, no snapshot migration step is visible in this body - confirm against the repository.
 */
public static synchronized void migrateIPV6Snapshots() {
    executor.shutdown();
    try {
        /* The boolean result (terminated or timed out) is deliberately not used here */
        executor.awaitTermination(10, TimeUnit.SECONDS);
    } catch (final InterruptedException e) {
        /* Catching InterruptedException clears the interrupt status : set it again instead of silently swallowing the interruption */
        Thread.currentThread().interrupt();
    }
}

/**
* get the number of entries for each of the transaction states
@@ -118,7 +128,7 @@ public static Revisions getRevisions(final State state, final String urlhash) {

/**
* list the snapshots for a given host name
* @param hostport the <host>.<port> identifier for the domain
* @param hostport the <host>.<port> identifier for the domain (with the same format as applied by the Snapshots.pathToHostPortDir() function).
* @param depth restrict the result to the given depth or if depth == -1 do not restrict to a depth
* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
* @return a map with a set for each depth in the domain of the host name
@@ -199,7 +209,9 @@ public static boolean store(final DigestURL url, final Date date, final int dept
// CLEAN UP OLD DATA (if wanted)
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
if (replaceOld && oldPaths != null) {
for (File oldPath: oldPaths) oldPath.delete();
for (File oldPath: oldPaths) {
oldPath.delete();
}
}

// STORE METADATA FOR THE IMAGE

0 comments on commit 7adbd1f

Please sign in to comment.
You can’t perform that action at this time.