Skip to content

Commit

Permalink
ENH: np.DataSource for transparent compression and remote files (#801)
Browse files Browse the repository at this point in the history
* ENH: Use np.DataSource to support compression and remote files

* Zip Solis data, fix Solis from method mode

* BUG: Accept Windows line endings in from_Cary

* MAINT: silence h5py warning

* ENH: add DataSource to open to allow remote and externally compressed

Only available with edit_local=False, cannot do transparent file-level compression

* DOC, MAINT: Ensure wt.open accepts path-like

* MAINT: Update from_Cary and from_directory to pathlib

* DOC, MAINT: add pathlib support in file type warning

* DOC: string->path-like in ini

* ENH, DOC: finish making all data from methods pathlib and DataSource

Closes #632

* MAINT: Open in text rather than binary in from_Cary

* DOC: mention local/remote/compressed in docstrings

* Handle URL scheme in from methods

* Handle URL scheme in from_cary

* Document opening urls/zipped files in data.rst

* DOC: add to documentation on open for remote/compressed files

* DOC, BUG: Document writing jasco, clean up code to fix bugs

* MAINT, BUG: simplify url handling with os.fspath

* DOC: Update from method docs with DataSource

* TST: Add tests of remote url retrieval

* DOC: add comment on Google Drive download link construction

* MAINT: require newer tidy_headers

* MAINT: Fix cary with fspath instead of urllib

* Fix tensor27 imports

* BUG: PosixPath -> PurePosixPath so it doesn't break on Windows
  • Loading branch information
ksunden authored and untzag committed Dec 2, 2018
1 parent df7a846 commit 86ebafa
Show file tree
Hide file tree
Showing 26 changed files with 344 additions and 5,345 deletions.
2 changes: 1 addition & 1 deletion WrightTools/_group.py
Expand Up @@ -60,7 +60,7 @@ def __init__(self, file=None, parent=None, name=None, **kwargs):
file.require_group(path)
h5py.Group.__init__(self, bind=file[path].id)
self.__n = 0
self.fid = self.file.fid
self.fid = self.file.id
self.natural_name = name
# attrs
self.attrs["class"] = self.class_name
Expand Down
34 changes: 27 additions & 7 deletions WrightTools/_open.py
Expand Up @@ -4,9 +4,12 @@
# --- import -------------------------------------------------------------------------------------


import posixpath
import os
import tempfile
import weakref

import h5py
import numpy as np

from . import collection as wt_collection
from . import data as wt_data
Expand All @@ -21,14 +24,18 @@

# --- functions ----------------------------------------------------------------------------------

_open = open


def open(filepath, edit_local=False):
"""Open any wt5 file, returning the top-level object (data or collection).
Parameters
----------
filepath : string
filepath : path-like
Path to file.
Can be either a local or remote file (http/ftp).
Can be compressed with gz/bz2, decompression based on file name.
edit_local : boolean (optional)
If True, the file itself will be opened for editing. Otherwise, a
copy will be created. Default is False.
Expand All @@ -38,12 +45,25 @@ def open(filepath, edit_local=False):
WrightTools Collection or Data
Root-level object in file.
"""
filepath = os.fspath(filepath)
ds = np.DataSource(None)
if edit_local is False:
tf = tempfile.mkstemp(prefix="", suffix=".wt5")
with _open(tf[1], "w+b") as tff:
with ds.open(str(filepath), "rb") as f:
tff.write(f.read())
filepath = tf[1]
f = h5py.File(filepath)
class_name = f[posixpath.sep].attrs["class"]
name = f[posixpath.sep].attrs["name"]
class_name = f["/"].attrs["class"]
name = f["/"].attrs["name"]
if class_name == "Data":
return wt_data.Data(filepath=filepath, name=name, edit_local=edit_local)
obj = wt_data.Data(filepath=str(filepath), name=name, edit_local=True)
elif class_name == "Collection":
return wt_collection.Collection(filepath=filepath, name=name, edit_local=edit_local)
obj = wt_collection.Collection(filepath=str(filepath), name=name, edit_local=True)
else:
return wt_group.Group(filepath=filepath, name=name, edit_local=edit_local)
obj = wt_group.Group(filepath=str(filepath), name=name, edit_local=True)

if edit_local is False:
setattr(obj, "_tmpfile", tf)
weakref.finalize(obj, obj.close)
return obj
16 changes: 10 additions & 6 deletions WrightTools/collection/_cary.py
Expand Up @@ -4,6 +4,7 @@
# --- import --------------------------------------------------------------------------------------


import os
import pathlib
import re

Expand Down Expand Up @@ -42,7 +43,7 @@ def from_Cary(filepath, name=None, parent=None, verbose=True):
Parameters
----------
filepath : string
filepath : path-like
Path to Cary output file (.csv).
parent : WrightTools.Collection
A collection object in which to place a collection of Data objects.
Expand All @@ -55,19 +56,22 @@ def from_Cary(filepath, name=None, parent=None, verbose=True):
New data object.
"""
# check filepath
filesuffix = pathlib.Path(filepath).suffix
if filesuffix != ".csv":
filestr = os.fspath(filepath)
filepath = pathlib.Path(filepath)

if ".csv" not in filepath.suffixes:
wt_exceptions.WrongFileTypeWarning.warn(filepath, "csv")
if name is None:
name = "cary"
# import array
lines = []
with open(str(filepath), "r", encoding="iso-8859-1") as f:
ds = np.DataSource(None)
with ds.open(filestr, "rt", encoding="iso-8859-1") as f:
header = f.readline()
columns = f.readline()
while True:
line = f.readline()
if line == "\n" or line == "":
if line == "\n" or line == "" or line == "\r\n":
break
else:
# Note, it is necessary to call this twice, as a single call will
Expand Down Expand Up @@ -97,7 +101,7 @@ def from_Cary(filepath, name=None, parent=None, verbose=True):
name = "{}_{:03d}".format(header[i], i // 2)
else:
name = header[i]
dat = datas.create_data(name, kind="Cary", source=filepath)
dat = datas.create_data(name, kind="Cary", source=filestr)
dat.create_variable(ax, arr[i][~np.isnan(arr[i])], units=units)
dat.create_channel(
columns[i + 1].lower(), arr[i + 1][~np.isnan(arr[i + 1])], label=columns[i + 1].lower()
Expand Down
21 changes: 11 additions & 10 deletions WrightTools/collection/_directory.py
Expand Up @@ -6,8 +6,8 @@

import fnmatch
import queue
import pathlib
import os
import posixpath

from ._collection import Collection

Expand All @@ -26,7 +26,7 @@ def from_directory(filepath, from_methods, *, name=None, parent=None, verbose=Tr
Parameters
----------
filepath: str
filepath: path-like
Path to the directory on the file system
from_methods: dict<str, callable>
Dictionary which maps patterns (using Unix-like glob wildcard patterns)
Expand Down Expand Up @@ -56,8 +56,9 @@ def from_directory(filepath, from_methods, *, name=None, parent=None, verbose=Tr
... }
>>> col = wt.collection.from_directory('path/to/folder', from_dict)
"""
filepath = pathlib.Path(filepath).resolve()
if name is None:
name = os.path.basename(os.path.abspath(filepath))
name = filepath.name

if verbose:
print("Creating Collection:", name)
Expand All @@ -66,26 +67,26 @@ def from_directory(filepath, from_methods, *, name=None, parent=None, verbose=Tr

q = queue.Queue()

for i in os.listdir(filepath):
q.put((filepath, i, root))
for i in filepath.iterdir():
q.put((filepath, i.name, root))

while not q.empty():
path, fname, parent = q.get()
for pattern, func in from_methods.items():
if fnmatch.fnmatch(fname, pattern):
if func is not None:
func(
os.path.join(path, fname),
path / fname,
name=os.path.splitext(fname)[0],
parent=parent,
verbose=verbose,
)
break
else:
if os.path.isdir(os.path.join(path, fname)):
if (path / fname).is_dir():
if verbose:
print("Creating Collection at", posixpath.join(parent.name, fname))
print("Creating Collection at", pathlib.PurePosixPath(parent.name) / fname)
col = parent.create_collection(name=fname)
for i in os.listdir(os.path.join(path, fname)):
q.put((os.path.join(path, fname), i, col))
for i in (path / fname).iterdir():
q.put((path / fname, i.name, col))
return root
21 changes: 15 additions & 6 deletions WrightTools/data/_brunold.py
Expand Up @@ -5,6 +5,7 @@


import os
import pathlib

import numpy as np

Expand All @@ -28,8 +29,10 @@ def from_BrunoldrRaman(filepath, name=None, parent=None, verbose=True) -> Data:
Parameters
----------
filepath : string, list of strings, or array of strings
filepath : path-like
Path to .txt file.
Can be either a local or remote file (http/ftp).
Can be compressed with gz/bz2, decompression based on file name.
name : string (optional)
Name to give to the created data object. If None, filename is used.
Default is None.
Expand All @@ -44,19 +47,25 @@ def from_BrunoldrRaman(filepath, name=None, parent=None, verbose=True) -> Data:
New data object(s).
"""
# parse filepath
if not filepath.endswith("txt"):
wt_exceptions.WrongFileTypeWarning.warn(filepath, "txt")
filestr = os.fspath(filepath)
filepath = pathlib.Path(filepath)

if not ".txt" in filepath.suffixes:
wt_exceptions.WrongFileTypeWarning.warn(filepath, ".txt")
# parse name
if not name:
name = os.path.basename(filepath).split(".")[0]
name = filepath.name.split(".")[0]
# create data
kwargs = {"name": name, "kind": "BrunoldrRaman", "source": filepath}
kwargs = {"name": name, "kind": "BrunoldrRaman", "source": filestr}
if parent is None:
data = Data(**kwargs)
else:
data = parent.create_data(**kwargs)
# array
arr = np.genfromtxt(filepath, delimiter="\t").T
ds = np.DataSource(None)
f = ds.open(filestr, "rt")
arr = np.genfromtxt(f, delimiter="\t").T
f.close()
# chew through all scans
data.create_variable(name="energy", values=arr[0], units="wn")
data.create_channel(name="signal", values=arr[1])
Expand Down
29 changes: 20 additions & 9 deletions WrightTools/data/_colors.py
Expand Up @@ -5,7 +5,7 @@


import os

import pathlib
import collections

import numpy as np
Expand Down Expand Up @@ -38,8 +38,10 @@ def from_COLORS(
Parameters
----------
filepaths : string or list of strings
filepaths : path-like or list of path-like
Filepath(s).
Can be either a local or remote file (http/ftp).
Can be compressed with gz/bz2, decompression based on file name.
name : string (optional)
Unique dataset identifier. If None (default), autogenerated.
cols : {'v0', 'v1', 'v2'} (optional)
Expand All @@ -60,15 +62,19 @@ def from_COLORS(
"""
# do we have a list of files or just one file? ------------------------------------------------
if isinstance(filepaths, list):
file_example = filepaths[0]
filestrs = [os.fspath(f) for f in filepaths]
filepaths = [pathlib.Path(f) for f in filepaths]
else:
file_example = filepaths
filepaths = [filepaths]
filestrs = [os.fspath(filepaths)]
filepaths = [pathlib.Path(filepaths)]
ds = np.DataSource(None)
# define format of dat file -------------------------------------------------------------------
if cols:
pass
else:
num_cols = len(np.genfromtxt(file_example).T)
f = ds.open(filestrs[0], "rt")
num_cols = len(np.genfromtxt(f).T)
f.close()
if num_cols in [28, 35, 41]:
cols = "v2"
elif num_cols in [20]:
Expand Down Expand Up @@ -126,7 +132,12 @@ def from_COLORS(
channels["ai2"] = {"idx": 12, "label": "2"}
channels["ai3"] = {"idx": 13, "label": "3"}
# import full array ---------------------------------------------------------------------------
arr = np.concatenate([np.genfromtxt(f).T for f in filepaths], axis=1)
arr = []
for f in filestrs:
ff = ds.open(f, "rt")
arr.append(np.genfromtxt(ff).T)
ff.close()
arr = np.concatenate(arr, axis=1)
if invert_d1:
idx = axes["d1"]["idx"]
arr[idx] = -arr[idx]
Expand All @@ -138,8 +149,8 @@ def from_COLORS(
scanned = wt_kit.discover_dimensions(arr, axes_discover)
# create data object --------------------------------------------------------------------------
if name is None:
name = wt_kit.string2identifier(os.path.basename(filepaths[0]))
kwargs = {"name": name, "kind": "COLORS", "source": filepaths}
name = wt_kit.string2identifier(filepaths[0].name)
kwargs = {"name": name, "kind": "COLORS", "source": filestrs}
if parent is not None:
data = parent.create_data(**kwargs)
else:
Expand Down
22 changes: 16 additions & 6 deletions WrightTools/data/_jasco.py
Expand Up @@ -5,6 +5,7 @@


import os
import pathlib

import numpy as np

Expand All @@ -26,8 +27,10 @@ def from_JASCO(filepath, name=None, parent=None, verbose=True) -> Data:
Parameters
----------
filepath : string, list of strings, or array of strings
filepath : path-like
Path to .txt file.
Can be either a local or remote file (http/ftp).
Can be compressed with gz/bz2, decompression based on file name.
name : string (optional)
Name to give to the created data object. If None, filename is used.
Default is None.
Expand All @@ -42,19 +45,26 @@ def from_JASCO(filepath, name=None, parent=None, verbose=True) -> Data:
New data object(s).
"""
# parse filepath
if not filepath.endswith("txt"):
wt_exceptions.WrongFileTypeWarning.warn(filepath, "txt")
filestr = os.fspath(filepath)
filepath = pathlib.Path(filepath)

if not ".txt" in filepath.suffixes:
wt_exceptions.WrongFileTypeWarning.warn(filepath, ".txt")
# parse name
if not name:
name = os.path.basename(filepath).split(".")[0]
name = filepath.name.split(".")[0]
# create data
kwargs = {"name": name, "kind": "JASCO", "source": filepath}
kwargs = {"name": name, "kind": "JASCO", "source": filestr}
if parent is None:
data = Data(**kwargs)
else:
data = parent.create_data(**kwargs)
# array
arr = np.genfromtxt(filepath, skip_header=18).T
ds = np.DataSource(None)
f = ds.open(filestr, "rt")
arr = np.genfromtxt(f, skip_header=18).T
f.close()

# chew through all scans
data.create_variable(name="energy", values=arr[0], units="nm")
data.create_channel(name="signal", values=arr[1])
Expand Down

0 comments on commit 86ebafa

Please sign in to comment.