Merge remote-tracking branch 'yelp/master'

2 parents 7f34dfd + c340f09 · commit c04b1739ec7eed08b18d25ff61eb2ac0f14814c5 · @tarnfeld committed Apr 8, 2013
Showing with 26 additions and 14 deletions.
  1. +5 −6 mrjob/fs/hadoop.py
  2. +3 −2 tests/fs/test_hadoop.py
  3. +18 −6 tests/mockhadoop.py
mrjob/fs/hadoop.py
@@ -137,7 +137,6 @@ def ls(self, path_glob):
         except CalledProcessError:
             raise IOError("Could not ls %s" % path_glob)
 
-        path_index = None
         for line in StringIO(stdout):
             line = line.rstrip('\r\n')
             fields = line.split(' ')
@@ -150,12 +149,12 @@ def ls(self, path_glob):
             # Expected lines:
             # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar # HDFS
             # -rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/bar # S3
+            path_index = None
+            for index, field in enumerate(fields):
+                if len(field) == 5 and field[2] == ':':
+                    path_index = (index + 1)
             if not path_index:
-                for index, field in enumerate(fields):
-                    if len(field) == 5 and field[2] == ':':
-                        path_index = (index + 1)
-                if not path_index:
-                    raise IOError("Could not locate path in string '%s'" % line)
+                raise IOError("Could not locate path in string '%s'" % line)
             path = line.split(' ', path_index)[-1]
 
             # handle fully qualified URIs from newer versions of Hadoop ls
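
The fix moves the field scan inside the per-line loop, so every line of the listing gets its own path_index: HDFS-style lines carry user and group columns while S3-style lines don't, so an index computed once for the whole listing can be wrong for part of it. A minimal standalone sketch of the heuristic (parse_ls_path is a hypothetical name, not mrjob's API): the time column is the only 5-character field with ':' in the middle, and the path is everything after it.

    def parse_ls_path(line):
        """Pull the path out of one 'hadoop fs -lsr' output line."""
        fields = line.split(' ')
        path_index = None
        for index, field in enumerate(fields):
            # the time column is exactly 5 chars with ':' in the middle, e.g. '14:00'
            if len(field) == 5 and field[2] == ':':
                path_index = index + 1
        if not path_index:
            raise IOError("Could not locate path in string '%s'" % line)
        # split at most path_index times so spaces inside the filename survive
        return line.split(' ', path_index)[-1]

    # works on both layouts, including a filename containing a space:
    parse_ls_path('-rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar')
    parse_ls_path('-rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/f3 win')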
tests/fs/test_hadoop.py
@@ -71,9 +71,10 @@ def test_ls_recurse(self):
 
     def test_ls_s3n(self):
         # hadoop fs -lsr doesn't have user and group info when reading from s3
-        self.make_mock_file('f')
+        self.make_mock_file('f', 'foo')
+        self.make_mock_file('f3 win', 'foo' * 10)
         self.assertItemsEqual(list(self.fs.ls('s3n://bucket/')),
-                              ['s3n://bucket/f'])
+                              ['s3n://bucket/f', 's3n://bucket/f3 win'])
 
     def test_single_space(self):
         self.make_mock_file('foo bar')
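
The old test only created an empty file named 'f'. The new one gives the two mock files different sizes (3 and 30 bytes) so the size column in the mock listing has to be padded, and 'f3 win' adds a filename containing a space, which a stale path_index would misparse. Illustrative arithmetic (not the test's assertions):

    sizes = [len('foo'), len('foo' * 10)]          # [3, 30]
    width = len(str(max(sizes)))                   # widest size is 2 digits
    padded = [str(s).rjust(width) for s in sizes]  # [' 3', '30']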
tests/mockhadoop.py
@@ -201,7 +201,7 @@ def hadoop_fs_lsr(stdout, stderr, environ, *args):
     """Implements hadoop fs -lsr."""
     hdfs_path_globs = args or ['']
 
-    def ls_line(real_path, scheme, netloc):
+    def ls_line(real_path, scheme, netloc, size=0, max_size=0):
         hdfs_path = real_path_to_hdfs_path(real_path, environ)
 
         # we could actually implement ls here, but mrjob only cares about
@@ -221,9 +221,12 @@ def ls_line(real_path, scheme, netloc):
         if scheme and environ.get('MOCK_HADOOP_LS_RETURNS_FULL_URIS'):
             hdfs_path = '%s://%s%s' % (scheme, netloc, hdfs_path)
 
+        # figure out the padding
+        size = str(size).rjust(len(str(max_size)))
+
         return (
-            '%srwxrwxrwx - %s 18321 2010-10-01 15:16 %s' %
-            (file_type, user_and_group, hdfs_path))
+            '%srwxrwxrwx - %s %s 2010-10-01 15:16 %s' %
+            (file_type, user_and_group, size, hdfs_path))
 
     failed = False
     for hdfs_path_glob in hdfs_path_globs:
@@ -233,6 +236,10 @@ def ls_line(real_path, scheme, netloc):
         real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
         real_paths = glob.glob(real_path_glob)
+
+        paths = []
+        max_size = 0
+
         if not real_paths:
             print >> stderr, (
                 'lsr: Cannot access %s: No such file or directory.' %
                 hdfs_path_glob)
@@ -242,12 +249,17 @@ def ls_line(real_path, scheme, netloc):
         for real_path in real_paths:
             if os.path.isdir(real_path):
                 for dirpath, dirnames, filenames in os.walk(real_path):
-                    print >> stdout, ls_line(dirpath, scheme, netloc)
+                    paths.append((dirpath, scheme, netloc, 0))
                     for filename in filenames:
                         path = os.path.join(dirpath, filename)
-                        print >> stdout, ls_line(path, scheme, netloc)
+                        size = os.path.getsize(path)
+                        max_size = size if size > max_size else max_size
+                        paths.append((path, scheme, netloc, size))
             else:
-                print >> stdout, ls_line(real_path, scheme, netloc)
+                paths.append((real_path, scheme, netloc, 0))
+
+        for path in paths:
+            print >> stdout, ls_line(*path + (max_size,))
 
     if failed:
         return -1
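
Because the width of the size column depends on the largest file in the listing, the mock can no longer print lines as it walks the tree; it buffers every entry, tracks max_size, and formats them all in a second pass. A condensed sketch of that scheme (format_listing is a hypothetical helper, not mockhadoop's function):

    import os

    def format_listing(real_paths):
        entries, max_size = [], 0
        for path in real_paths:  # first pass: collect entries and sizes
            size = os.path.getsize(path) if os.path.isfile(path) else 0
            max_size = max(max_size, size)
            entries.append((path, size))
        width = len(str(max_size))  # the widest size sets the column width
        return ['-rwxrwxrwx -  %s 2010-10-01 15:16 %s'
                % (str(size).rjust(width), path)
                for path, size in entries]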
