Skip to content

Commit

Permalink
Add S3 resolve_content
Browse files Browse the repository at this point in the history
  • Loading branch information
tools4origins committed Dec 16, 2019
1 parent 97af059 commit bdd1ec1
Showing 1 changed file with 27 additions and 1 deletion.
28 changes: 27 additions & 1 deletion pysparkling/fileio/fs/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging

from ...exceptions import FileSystemNotSupported
from ...utils import Tokenizer
from ...utils import Tokenizer, parse_file_uri
from .file_system import FileSystem

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -50,6 +50,8 @@ def __init__(self, file_name):
@classmethod
def _get_conn(cls):
    """Return the class-level S3 connection, creating it on first use.

    The connection is stored on ``cls._conn`` and handed back unchanged
    by later calls.

    Raises:
        FileSystemNotSupported: if a connection has to be created while
            ``boto`` is ``None``.
    """
    # Fast path: a connection was already established earlier.
    if cls._conn:
        return cls._conn

    # Creating a connection needs boto; fail with an explicit message
    # rather than an AttributeError on ``None``.
    if boto is None:
        raise FileSystemNotSupported('S3 not supported. Install "boto".')

    cls._conn = boto.connect_s3(**cls.connection_kwargs)
    return cls._conn

Expand All @@ -76,6 +78,30 @@ def resolve_filenames(cls, expr):
))
return files

@classmethod
def resolve_content(cls, expr):
    """List the S3 keys selected by ``expr`` as full URIs.

    ``expr`` is split by ``parse_file_uri`` into scheme, bucket name,
    folder path and pattern. A key listed under the folder prefix is
    kept when its name matches the glob built from folder path and
    pattern, or when it matches that glob followed by ``/*`` (i.e. it
    sits inside a folder matched by the glob).

    Returns:
        A list of ``scheme://bucket/key`` strings, in the order the
        bucket listing yields the keys.
    """
    scheme, bucket_name, folder_path, pattern = parse_file_uri(expr)

    # Drop the first character of the folder path (its leading slash)
    # before using it as a key prefix.
    key_prefix = folder_path[1:]

    # Glob matching the keys themselves.
    file_glob = "{0}{1}".format(key_prefix, pattern)
    # Glob matching everything inside folders that match ``file_glob``.
    separator = "" if file_glob.endswith("/") else "/"
    folder_glob = "{0}{1}*".format(file_glob, separator)

    bucket = cls._get_conn().get_bucket(bucket_name, validate=False)

    return [
        '{0}://{1}/{2}'.format(scheme, bucket_name, key.name)
        for key in bucket.list(prefix=key_prefix)
        if fnmatch(key.name, file_glob) or fnmatch(key.name, folder_glob)
    ]

def exists(self):
t = Tokenizer(self.file_name)
t.next('//') # skip scheme
Expand Down

0 comments on commit bdd1ec1

Please sign in to comment.