Skip to content

Commit

Permalink
add Saver class
Browse files Browse the repository at this point in the history
  • Loading branch information
TsumiNa committed Jan 21, 2018
1 parent df82c8b commit c4ba486
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 10 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ before_script:
fi

# install miniconda
install: source travis/install.sh
install:
- source travis/install.sh
- pip install -e .

script:
- make unittest
Expand Down
26 changes: 24 additions & 2 deletions samples/data_loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,29 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/liuchang/projects/XenonPy/LICENSE\n/Users/liuchang/projects/XenonPy/MANIFEST.in\n/Users/liuchang/projects/XenonPy/Makefile\n/Users/liuchang/projects/XenonPy/README.md\n/Users/liuchang/projects/XenonPy/environment.yml\n/Users/liuchang/projects/XenonPy/environment_py35.yml\n/Users/liuchang/projects/XenonPy/licences\n/Users/liuchang/projects/XenonPy/readthedocs.yml\n/Users/liuchang/projects/XenonPy/docs\n/Users/liuchang/projects/XenonPy/requirements.txt\n/Users/liuchang/projects/XenonPy/requirements_test.txt\n/Users/liuchang/projects/XenonPy/setup.cfg\n/Users/liuchang/projects/XenonPy/setup.py\n/Users/liuchang/projects/XenonPy/tests\n/Users/liuchang/projects/XenonPy/travis\n/Users/liuchang/projects/XenonPy/xenonpy\n/Users/liuchang/projects/XenonPy/xenonpy.egg-info\n/Users/liuchang/projects/XenonPy/samples\n"
]
}
],
"source": [
"from xenonpy.utils.datatools import Saver\n",
"from pathlib import Path\n",
"\n",
"save = Saver(Path().cwd())\n",
"for f in save:\n",
" print(f)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -70,7 +92,7 @@
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-5e940c08cc0e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mxenonpy\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mele\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'elements_complete'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mele\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-4-5e940c08cc0e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mxenonpy\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mele\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'elements_complete'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mele\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mImportError\u001b[0m: cannot import name 'load'"
],
"output_type": "error"
Expand Down
136 changes: 129 additions & 7 deletions xenonpy/utils/datatools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

from collections.abc import Iterator
from datetime import datetime as dt
from os import remove
from os.path import getmtime
from pathlib import Path
from shutil import rmtree

import pandas as pd
import urllib3
from sklearn.externals import joblib as jl
from urllib3.exceptions import HTTPError

from .. import Path
from .. import __pkg_cfg_root__
from .. import _get_dataset_url

Expand Down Expand Up @@ -115,6 +122,7 @@ def __init__(self, path: str = None, *,
self.cached_dir = Path().home() / __pkg_cfg_root__ / 'cached'

def _fetch_data(self, url: str, save_to):

http = urllib3.PoolManager()

# fetch `name.pkl` file
Expand Down Expand Up @@ -145,15 +153,11 @@ def __call__(self, name: str, include=None, exclude=None):
-----------
name: str
name of dataset.
include: str-list
filter which columns should be included.
exclude: str-list
filter which columns should drop out.
Returns
------
pandas.DataFrame
return loaded data in panda.DataFrame object.
DataFrame or Saver or local file path.
return loaded data in DataFrame object.
"""

# check dataset name
Expand Down Expand Up @@ -226,3 +230,121 @@ def elements_completed(self):
imputed element properties in pd.DataFrame
"""
return self('elements_completed')


class Saver(Iterator):
    """
    Save Python objects into a directory and load them back by save order.

    Every saved object goes into its own time-stamped file. Files are indexed
    from oldest to newest, so ``save[0]`` is the first object saved and
    ``save.last()`` is the most recent one.

    .. code:: python

        import numpy as np
        np.random.seed(0)

        # some data
        data1 = np.random.randn(5, 5)
        data2 = np.random.randint(5, size=(5, 5))

        # init Saver
        save = Saver('you_dataset_name')

        # save data
        save(data1, data2)

        # retrieve data
        data = save.last()   # last saved
        data = save[0]       # by index
        for data in save:    # as iterator
            do_something(data)

        # delete data
        save.delete(0)       # by index
        save.delete()        # delete the 'you_dataset_name' dir
    """

    def __init__(self, path=None, suffix=None):
        """
        Parameters
        ----------
        path: str or pathlib.Path
            The dir to save and load data.
            An absolute path is used as it is. A relative path is placed
            under ``~/<__pkg_cfg_root__>/cached`` so that saving and loading
            always work on the same directory.
        suffix: str
            If given, only files whose last suffix equals this value
            (e.g. ``'.z'``) are indexed.

        Raises
        ------
        TypeError
            If ``path`` is ``None``.
        """
        if path is None:
            raise TypeError('`path` is required: give the dir to save and load data')

        self._it = 0  # cursor used by the iterator protocol

        target = Path(path)
        if not target.is_absolute():
            # same location that saving has always used for relative names
            target = Path.home() / __pkg_cfg_root__ / 'cached' / target

        self.path = target
        self.suffix = suffix
        self.files = []
        self._make_file_index()

    def _make_file_index(self):
        """Rebuild ``self.files``: visible files in ``self.path``, oldest first."""
        if not self.path.is_dir():
            # nothing saved yet (or the dir was deleted): empty index
            self.files = []
            return

        files = [f for f in self.path.iterdir() if not f.match('.*')]
        if self.suffix:
            files = [f for f in files if f.suffix == self.suffix]
        # file names are timestamps, so they break ties between equal mtimes
        files.sort(key=lambda f: (getmtime(str(f)), f.name))
        self.files = files

    @staticmethod
    def _load(file):
        """Load one file with the reader matching the writer used in ``__call__``."""
        if file.suffix == '.pd_':
            return pd.read_pickle(str(file))
        return jl.load(str(file))

    def last(self):
        """
        Return last saved data.

        Returns
        -------
        result: any Python object
            Data loaded from the newest indexed file.

        Raises
        ------
        IndexError
            If no file is indexed.
        """
        return self._load(self.files[-1])

    def delete(self, item=None):
        """
        Delete file(s) under given dir.

        Parameters
        ----------
        item: int or slice
            Index of the file(s) to delete.
            If ``None``, the whole dir is deleted.
        """
        # compare with None explicitly: index 0 is falsy but is a valid index
        if item is None:
            rmtree(str(self.path))
            self.files = []
            return

        targets = self.files[item]
        if not isinstance(targets, list):
            # an int index yields a single path, not a list
            targets = [targets]
        for f in targets:
            remove(str(f))
        self._make_file_index()

    def __getitem__(self, item):
        """Load the data at an int index, or a list of data for a slice."""
        files = self.files[item]
        if not isinstance(files, list):
            return self._load(files)
        return [self._load(f) for f in files]

    def __iter__(self):
        """Restart iteration from the oldest file."""
        self._it = 0
        return self

    def __next__(self):
        """Load and return the next file's data, oldest first."""
        try:
            file = self.files[self._it]
        except IndexError:
            raise StopIteration
        self._it += 1
        return self._load(file)

    def __call__(self, *data):
        """
        Save each given object into its own time-stamped file.

        ``pandas.DataFrame`` objects are written with ``pandas.to_pickle``
        as ``*.pkl.pd_``; everything else is written with ``joblib.dump``
        as ``*.pkl.z``.

        Parameters
        ----------
        data: any Python object
            Objects to save, in order.
        """
        self.path.mkdir(parents=True, exist_ok=True)

        for d in data:
            file_name = dt.now().strftime('%Y-%m-%d_%H-%M-%S_%f')

            # parenthesize the name: `/` binds tighter than `+`,
            # and a Path can not be added to a str
            if isinstance(d, pd.DataFrame):
                file = self.path / (file_name + '.pkl.pd_')
                pd.to_pickle(d, str(file))
            else:
                file = self.path / (file_name + '.pkl.z')
                jl.dump(d, str(file))  # dump this item, not the whole tuple

        # make the new files visible to indexing and iteration
        self._make_file_index()

0 comments on commit c4ba486

Please sign in to comment.