Skip to content

Commit

Permalink
New version of walk for storage roots and objects
Browse files Browse the repository at this point in the history
  • Loading branch information
zimeon committed Jul 24, 2020
1 parent 73c5675 commit 5431676
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 39 deletions.
18 changes: 14 additions & 4 deletions ocfl/pyfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,25 @@ def open_fs(fs_url, **kwargs):
acl=parse_result.params.get("acl", None),
cache_control=parse_result.params.get("cache_control", None),
strict=strict)
s3fs.getinfo = s3fs._getinfo # Patch in version that doesn't check parent directory
s3fs.getinfo = s3fs._getinfo # Patch in version of method that doesn't check parent directory
return s3fs
else:
# Non-S3 URL
return fs.open_fs(fs_url, **kwargs)


def walk(f, dir):
def ocfl_walk(f, dir='/', is_storage_root=False):
"""Walk that works on pyfs filesystems including S3 without the need for directory objects.
Assumes that f.getinfo() will work for a file/resource that exists and
that fs.errors.ResourceNotFound might be raised if called on a filesystem
without directories (and no directory objects).
When walking storage roots (is_storage_root=True), the condition to
descend is:
1) this is the root (dirpath == '/'), or
2) there are no files in this directory (see https://ocfl.io/1.0/spec/#root-structure)
FIXME - QUICK AND DIRTY HACK, CAN DO BETTER!
"""
if not dir.startswith('/'):
Expand All @@ -83,6 +88,7 @@ def walk(f, dir):
entries = f.listdir(dirpath)
files = []
dirs = []
dirpaths = []
for entry in entries:
entry_path = fs.path.join(dirpath, entry)
is_dir = True
Expand All @@ -91,10 +97,14 @@ def walk(f, dir):
# print(entry_path + " info: " + str(info))
is_dir = info.is_dir
except fs.errors.ResourceNotFound:
pass # Must be a directory
pass # Assume to be a directory
if is_dir:
dirs.append(entry)
stack.append(entry_path)
dirpaths.append(entry_path)
else:
files.append(entry)
if not is_storage_root or dirpath == '/' or len(files) == 0:
# Descend unless we are walking a storage root and this is a non-root
# directory that contains files (in which case we stop here)
stack.extend(dirpaths)
yield(dirpath, dirs, files)
37 changes: 4 additions & 33 deletions ocfl/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
should enable application beyond the operating system filesystem.
"""
import fs
import fs.walk
import hashlib
import json
import re
Expand All @@ -18,37 +17,10 @@
from .disposition import get_dispositor
from .namaste import find_namastes, Namaste
from .object import Object
from .pyfs import open_fs
from .pyfs import open_fs, ocfl_walk
from .validator import Validator


class StoreWalker(fs.walk.Walker):
    """Walker tailored to traversing an OCFL Storage Root."""

    def check_open_dir(self, pyfs, path, info):
        """Decide whether a directory under the storage root should be descended into.

        The root (path == '/') is always opened. Any other directory is
        opened only if it contains no files
        (see https://ocfl.io/1.0/spec/#root-structure).

        Arguments:
            pyfs (FS): A filesystem instance.
            path (str): Path to directory.
            info (Info): A resource info object for the directory
                (not used by this check).

        Returns:
            bool: `True` if the directory should be opened.
        """
        # The storage root itself is always walked
        if path == '/':
            return True
        # any() stops at the first entry that is a file, which is enough
        # to rule out descending further
        return not any(entry.is_file for entry in pyfs.scandir(path))


class StoreException(Exception):
"""Exception class for OCFL Storage Root."""

Expand Down Expand Up @@ -173,17 +145,16 @@ def object_paths(self):
Will log any errors seen while traversing the directory tree under the
storage root.
"""
walker = StoreWalker()
for (dirpath, dirs, files) in walker.walk(self.root_fs):
for (dirpath, dirs, files) in ocfl_walk(self.root_fs, is_storage_root=True):
if dirpath == '/':
pass # Ignore files in root
pass # Ignore files in storage root
elif (len(dirs) + len(files)) == 0:
self.traversal_error("Empty directory %s" % (dirpath))
elif len(files) == 0:
pass # Just an intermediate directory
else:
# Is this directory an OCFL object? Look for any 0= file.
zero_eqs = [file.name for file in files if file.is_file and file.name.startswith('0=')]
zero_eqs = [file for file in files if file.startswith('0=')]
if len(zero_eqs) > 1:
self.traversal_error("Multiple 0= declaration files in %s, ignoring" % (dirpath))
elif len(zero_eqs) == 1:
Expand Down
4 changes: 2 additions & 2 deletions ocfl/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .digest import file_digest, normalized_digest
from .inventory_validator import InventoryValidator
from .namaste import find_namastes, NamasteException
from .pyfs import open_fs, walk
from .pyfs import open_fs, ocfl_walk
from .validation_logger import ValidationLogger
from .w3c_datetime import str_to_datetime

Expand Down Expand Up @@ -241,7 +241,7 @@ def validate_content(self, inventory, version_dirs):
# Check content_directory
content_path = fs.path.join(version_dir, self.content_directory)
num_content_files_in_version = 0
for dirpath, dirs, files in walk(self.obj_fs, content_path):
for dirpath, dirs, files in ocfl_walk(self.obj_fs, content_path):
if dirpath != '/' + content_path and (len(dirs) + len(files)) == 0:
self.log.error("E024", where=version_dir, path=dirpath)
for file in files:
Expand Down

0 comments on commit 5431676

Please sign in to comment.