Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to list all objects under an S3 location prefix. #3221

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -62,6 +62,7 @@ public class HiveS3Config
private PrestoS3AclType s3AclType = PrestoS3AclType.PRIVATE;
private boolean skipGlacierObjects;
private boolean requesterPaysEnabled;
private boolean usePseudoDirectories = true;

public String getS3AwsAccessKey()
{
Expand Down Expand Up @@ -457,4 +458,16 @@ public HiveS3Config setRequesterPaysEnabled(boolean requesterPaysEnabled)
this.requesterPaysEnabled = requesterPaysEnabled;
return this;
}

public boolean isUsePseudoDirectories()
{
return usePseudoDirectories;
}

@Config("hive.s3.use-pseudo-directories")
public HiveS3Config setUsePseudoDirectories(boolean usePseudoDirectories)
{
this.usePseudoDirectories = usePseudoDirectories;
return this;
}
}
Expand Up @@ -52,6 +52,7 @@
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_STORAGE_CLASS;
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USER_AGENT_PREFIX;
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USE_INSTANCE_CREDENTIALS;
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USE_PSEUDO_DIRECTORIES;

public class PrestoS3ConfigurationInitializer
implements ConfigurationInitializer
Expand Down Expand Up @@ -86,6 +87,7 @@ public class PrestoS3ConfigurationInitializer
private final String signerClass;
private final boolean requesterPaysEnabled;
private final boolean skipGlacierObjects;
private final boolean usePseudoDirectories;

@Inject
public PrestoS3ConfigurationInitializer(HiveS3Config config)
Expand Down Expand Up @@ -120,6 +122,7 @@ public PrestoS3ConfigurationInitializer(HiveS3Config config)
this.aclType = config.getS3AclType();
this.skipGlacierObjects = config.isSkipGlacierObjects();
this.requesterPaysEnabled = config.isRequesterPaysEnabled();
this.usePseudoDirectories = config.isUsePseudoDirectories();
}

@Override
Expand Down Expand Up @@ -178,5 +181,6 @@ public void initializeConfiguration(Configuration config)
config.set(S3_ACL_TYPE, aclType.name());
config.setBoolean(S3_SKIP_GLACIER_OBJECTS, skipGlacierObjects);
config.setBoolean(S3_REQUESTER_PAYS_ENABLED, requesterPaysEnabled);
config.setBoolean(S3_USE_PSEUDO_DIRECTORIES, usePseudoDirectories);
}
}
Expand Up @@ -166,6 +166,7 @@ public class PrestoS3FileSystem
public static final String S3_SKIP_GLACIER_OBJECTS = "presto.s3.skip-glacier-objects";
public static final String S3_REQUESTER_PAYS_ENABLED = "presto.s3.requester-pays.enabled";
public static final String S3_STORAGE_CLASS = "presto.s3.storage-class";
public static final String S3_USE_PSEUDO_DIRECTORIES = "presto.s3.use-pseudo-directories";

static final String S3_DIRECTORY_OBJECT_CONTENT_TYPE = "application/x-directory";

Expand Down Expand Up @@ -201,6 +202,7 @@ public class PrestoS3FileSystem
private boolean skipGlacierObjects;
private boolean requesterPaysEnabled;
private PrestoS3StorageClass s3StorageClass;
private boolean usePseudoDirectories;

@Override
public void initialize(URI uri, Configuration conf)
Expand Down Expand Up @@ -247,6 +249,7 @@ public void initialize(URI uri, Configuration conf)
this.skipGlacierObjects = conf.getBoolean(S3_SKIP_GLACIER_OBJECTS, defaults.isSkipGlacierObjects());
this.requesterPaysEnabled = conf.getBoolean(S3_REQUESTER_PAYS_ENABLED, defaults.isRequesterPaysEnabled());
this.s3StorageClass = conf.getEnum(S3_STORAGE_CLASS, defaults.getS3StorageClass());
this.usePseudoDirectories = conf.getBoolean(S3_USE_PSEUDO_DIRECTORIES, defaults.isUsePseudoDirectories());

ClientConfiguration configuration = new ClientConfiguration()
.withMaxErrorRetry(maxErrorRetries)
Expand Down Expand Up @@ -527,25 +530,26 @@ private Iterator<LocatedFileStatus> listPrefix(Path path)
ListObjectsV2Request request = new ListObjectsV2Request()
.withBucketName(getBucketName(uri))
.withPrefix(key)
.withDelimiter(PATH_SEPARATOR)
.withRequesterPays(requesterPaysEnabled);
if (usePseudoDirectories) {
request.setDelimiter(PATH_SEPARATOR);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i understand the conditional removal of .setDelimiter(PATH_SEPARATOR) as to list all sub directories in once shot, instead of hoping over dir levels. This is appropriate to directory listing but only for tables/partitions when hive.recursive-directories is on.

  • shouldn’t this new option be enabled only if that option is enabled?
  • shouldn’t we know that FS layer is called from the place where actually hive.recursive-directories matters — ie shouldn’t we modify the calling side to pass information that all subdirs are needed? Then we wouldn’t need the new toggle at all

cc @electrum

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To add to what @findepi said, I think with list caching done in CachingDirectoryLister, we can have an inconsistent (and sometimes incorrect) view of the list results in this approach.
Further, doesn't lazy split generation done currently help in these scenarios?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@findepi I reworked this to use hive.recursive-directories to control the behavior. Much cleaner this way.

@rohangarg - can you elaborate on when this could lead to an inconsistent list, and how thats different than walking the directories? I'm afraid I may be missing something there.

You're right that lazy split generation is good enough for most cases. The performance impact looks like it's significant when A) the number of subdirectories per partition is O(100), and B) the target query time is O(1 second). But in general, reducing the number of calls to S3 should be a good thing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The example that I was thinking where error can occur is :

  1. set recursive listing to true and query a path - with the feature, the full nested objects are cached in the list located status call (in CachingDirectoryLister's cache obj).
  2. set recursive listing to false and query a path - now while listing the path, the nested objects should not occur in the result whereas the Lister's cache has complete nested listing stored.

}

STATS.newListObjectsCall();
Iterator<ListObjectsV2Result> listings = new AbstractSequentialIterator<ListObjectsV2Result>(s3.listObjectsV2(request))
ListObjectsV2Result listObjectsV2 = s3.listObjectsV2(request);
log.debug(listObjectsV2.getObjectSummaries().toString());
willmostly marked this conversation as resolved.
Show resolved Hide resolved
Iterator<ListObjectsV2Result> listings = new AbstractSequentialIterator<ListObjectsV2Result>(listObjectsV2)
{
@Override
protected ListObjectsV2Result computeNext(ListObjectsV2Result previous)
{
if (!previous.isTruncated()) {
return null;
}

request.setContinuationToken(previous.getNextContinuationToken());

return s3.listObjectsV2(request);
}
};

return Iterators.concat(Iterators.transform(listings, this::statusFromListing));
}

Expand Down
Expand Up @@ -63,7 +63,8 @@ public void testDefaults()
.setS3UserAgentPrefix("")
.setS3AclType(PrestoS3AclType.PRIVATE)
.setSkipGlacierObjects(false)
.setRequesterPaysEnabled(false));
.setRequesterPaysEnabled(false)
.setUsePseudoDirectories(true));
}

@Test
Expand Down Expand Up @@ -100,6 +101,7 @@ public void testExplicitPropertyMappings()
.put("hive.s3.upload-acl-type", "PUBLIC_READ")
.put("hive.s3.skip-glacier-objects", "true")
.put("hive.s3.requester-pays.enabled", "true")
.put("hive.s3.use-pseudo-directories", "false")
.build();

HiveS3Config expected = new HiveS3Config()
Expand Down Expand Up @@ -132,7 +134,8 @@ public void testExplicitPropertyMappings()
.setS3UserAgentPrefix("user-agent-prefix")
.setS3AclType(PrestoS3AclType.PUBLIC_READ)
.setSkipGlacierObjects(true)
.setRequesterPaysEnabled(true);
.setRequesterPaysEnabled(true)
.setUsePseudoDirectories(false);

assertFullMapping(properties, expected);
}
Expand Down