Skip to content

Commit

Permalink
Add logic to only extract *part* of an archive.
Browse files Browse the repository at this point in the history
This is done by treating "foo.mfn a b c" as "extract only the files named
a, b, or c from foo.mfn" (names are compared exactly against the manifest's entries).

I *think* this is a good idea, but am willing to reconsider.
  • Loading branch information
sz3 committed Feb 11, 2020
1 parent ff17ef7 commit 4e09876
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 14 deletions.
30 changes: 28 additions & 2 deletions pog/lib/blob_store.py
Expand Up @@ -4,6 +4,7 @@
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse

from collections import defaultdict
from pog.fs.pogfs import get_cloud_fs


Expand All @@ -25,7 +26,30 @@ class download_list():
def __init__(self, *args, **kwargs):
    """Record the inputs to iterate over and the optional settings.

    Positional arguments are handed to `_flatten` and the result is kept
    as `self.filenames`.

    Keyword arguments (all optional):
    fs_info -- stored as `self.fs_info`; defaults to an empty list.
    yield_fs_info -- stored as `self.yield_fs_info`; defaults to False.
    extract -- stored as `self.extract`; defaults to False. When truthy,
        `self.filenames` and `self.partials` are replaced by the result
        of `_determine_partials`.
    """
    # NOTE(review): `_flatten` is defined outside this view; presumably it
    # returns a mutable list of strings -- confirm.
    self.filenames = _flatten(*args)
    self.fs_info = kwargs.get('fs_info', [])
    self.yield_fs_info = kwargs.get('yield_fs_info', False)
    # Maps an mfn argument to the set of non-mfn arguments that followed it.
    # Stays empty unless `extract` is set.
    self.partials = {}

    # `extract` mode does two things:
    # 1. iterator returns a tuple with fs_info in it
    # 2. consumes non-mfn arguments and returns them with the preceding mfn, if there is one
    self.extract = kwargs.get('extract', False)
    if self.extract:
        self.filenames, self.partials = self._determine_partials(self.filenames)

def _determine_partials(self, filenames):
    """Separate "partial" arguments from the inputs that should be iterated.

    Every argument that does not end in `.mfn` and that follows an `.mfn`
    argument is treated as a partial of the closest preceding `.mfn`: it is
    dropped from the returned filename list and recorded in that manifest's
    set. Non-mfn arguments that appear before any `.mfn` are kept as-is.

    The input sequence is never modified; a new list is returned.

    Returns a `(remaining, partials)` tuple:
    remaining -- list of the arguments that were not consumed as partials,
        in their original order.
    partials -- plain dict mapping each `.mfn` argument that has partials
        to the set of names that followed it. Manifests with no partials
        have no entry.

    When `self.extract` is falsy nothing is consumed, and the result is
    `(list(filenames), {})` so callers can always unpack two values.
    """
    if not self.extract:
        return list(filenames), {}

    remaining = []
    partials = defaultdict(set)
    current_mfn = None
    for f in filenames:
        if f.endswith('.mfn'):
            current_mfn = f
        elif current_mfn:
            # Consumed by the preceding manifest. Skipping by position (not
            # by value) keeps an identical name that appeared earlier in the
            # list where it was.
            partials[current_mfn].add(f)
            continue
        remaining.append(f)
    return remaining, dict(partials)

def __iter__(self):
self.it = iter(self.filenames)
Expand All @@ -38,8 +62,10 @@ def __next__(self):
pass
try:
filename = next(self.it)
partials = self.partials.get(filename)

filename, self.tempfile, fs_info = self._download_if_necessary(filename, *self.fs_info)
return filename if not self.yield_fs_info else (filename, fs_info)
return filename if not self.extract else (filename, fs_info, partials)
except StopIteration:
raise

Expand Down
22 changes: 13 additions & 9 deletions pog/pog.py
Expand Up @@ -13,16 +13,18 @@
pog (-h | --help)
Examples:
python -m pog.pog /path/to/file1 /path/to/file2 ...
python -m pog.pog --chunk-size=50MB bigfile
python -m pog.pog --decrypt 2019-10-31T12:34:56.012345.mfn
python -m pog.pog /path/to/file1 /path/to/file2
pog /path/to/file1 /path/to/file2
pog --chunk-size=50MB bigfile
pog --decrypt 2019-10-31T12:34:56.012345.mfn
python -m pog.pog /home/myfile.original > outputs.txt
python -m pog.pog --decrypt $(cat outputs.txt) > myfile.copy
pog /home/myfile.original > outputs.txt
pog --decrypt $(cat outputs.txt) > myfile.copy
python -m pog.pog --encryption-keyfile=pki.encrypt /path/to/file*
python -m pog.pog --decryption-keyfile=pki.decrypt --consume 2019-10-31T12:34:56.012345.mfn
python -m pog.pog --encryption-keyfile=pki.encrypt --dump-manifest-index 2019-*
pog --encryption-keyfile=pki.encrypt /path/to/file*
pog --encryption-keyfile=pki.encrypt --dump-manifest-index 2019-*
pog --decryption-keyfile=pki.decrypt s3://mybucket/2019-10-31T12:34:56.012345.mfn
pog --decryption-keyfile=pki.decrypt --consume 2019-10-31T12:34:56.012345.mfn
Options:
-h --help Show this help.
Expand Down Expand Up @@ -345,11 +347,13 @@ def dump_manifest(self, *inputs, show_filenames=True):
print(blob)

def decrypt(self, *inputs):
for filename, fs_info in download_list(inputs, yield_fs_info=True):
for filename, fs_info, partials in download_list(inputs, extract=True):
decompressor = zstd.ZstdDecompressor()
if filename.endswith('.mfn'):
mfn = self.load_manifest(filename)
for og_filename, info in mfn.items():
if partials and og_filename not in partials:
continue
copy_filename = path.normpath('./{}'.format(og_filename))
dir_path = path.dirname(copy_filename)
if dir_path:
Expand Down
46 changes: 43 additions & 3 deletions tests/test_blob_store.py
Expand Up @@ -50,29 +50,69 @@ def test_download_mfns(self, mock_s3, mock_b2):

@patch('pog.fs.pogfs.b2fs', autoSpec=True)
@patch('pog.fs.pogfs.s3fs', autoSpec=True)
def test_download_yield_fs_info(self, mock_s3, mock_b2):
def test_download_extract(self, mock_s3, mock_b2):
mock_b2.return_value = mock_b2
mock_s3.return_value = mock_s3

local_paths = []
fs_infos = []
for f, fs_info in download_list('boring.mfn', 's3://bucket1/file.mfn', 'b2://bucket2/another.mfn',
yield_fs_info=True):
partials = []
for f, fs_info, prtl in download_list('boring.mfn', 's3://bucket1/file.mfn', 'b2://bucket2/another.mfn',
extract=True):
local_paths.append(f)
fs_infos.append(fs_info)
partials.append(prtl)
if f != 'boring.mfn': # no tempfile download for local file
self.assertTrue(path.exists(f))

self.assertEqual(local_paths[0], 'boring.mfn')
self.assertEqual(fs_infos[0], [])
self.assertEqual(partials[0], None)

mock_s3.assert_called_once_with('bucket1')
mock_s3.download_file.assert_any_call(local_paths[1], 'file.mfn')
self.assertEqual(fs_infos[1], ('s3', 'bucket1'))
self.assertEqual(partials[1], None)

mock_b2.assert_called_once_with('bucket2')
mock_b2.download_file.assert_any_call(local_paths[2], 'another.mfn')
self.assertEqual(fs_infos[2], ('b2', 'bucket2'))
self.assertEqual(partials[2], None)

# should clean up
for f in local_paths:
self.assertFalse(path.exists(f))

@patch('pog.fs.pogfs.b2fs', autoSpec=True)
@patch('pog.fs.pogfs.s3fs', autoSpec=True)
def test_download_extract_with_partials(self, mock_s3, mock_b2):
mock_b2.return_value = mock_b2
mock_s3.return_value = mock_s3

local_paths = []
fs_infos = []
partials = []
for f, fs_info, prtl in download_list('boring.mfn', 'file1', 'file2', 's3://bucket1/file.mfn', 'dir/file',
'b2://bucket2/another.mfn', extract=True):
local_paths.append(f)
fs_infos.append(fs_info)
partials.append(prtl)
if f != 'boring.mfn': # no tempfile download for local file
self.assertTrue(path.exists(f))

self.assertEqual(local_paths[0], 'boring.mfn')
self.assertEqual(fs_infos[0], [])
self.assertEqual(partials[0], {'file1', 'file2'})

mock_s3.assert_called_once_with('bucket1')
mock_s3.download_file.assert_any_call(local_paths[1], 'file.mfn')
self.assertEqual(fs_infos[1], ('s3', 'bucket1'))
self.assertEqual(partials[1], {'dir/file'})

mock_b2.assert_called_once_with('bucket2')
mock_b2.download_file.assert_any_call(local_paths[2], 'another.mfn')
self.assertEqual(fs_infos[2], ('b2', 'bucket2'))
self.assertEqual(partials[2], None)

# should clean up
for f in local_paths:
Expand Down

0 comments on commit 4e09876

Please sign in to comment.