diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..1fd76bb --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,26 @@ +[bumpversion] +current_version = 0.1.0 +commit = True +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<build>\d+))? +serialize = + {major}.{minor}.{patch}.{release}{build} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = rc +first_value = dev +values = + dev + rc + +[bumpversion:part:build] + +[bumpversion:file:setup.py] +search = version='{current_version}' +replace = version='{new_version}' + +[bumpversion:file:stockroom/__init__.py] +search = __version__ = '{current_version}' +replace = __version__ = '{new_version}' + diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..2c48baf --- /dev/null +++ b/.coveragerc @@ -0,0 +1,18 @@ +[paths] +source = stockroom + +[run] +branch = True +source = + stockroom + tests +parallel = True + +[report] +exclude_lines = + pragma: no cover + def __repr__ + def _repr_pretty_ + def _ipython_key_completions_ +show_missing = True +precision = 2 diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..eb7fcd4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,15 @@ +* stockroom version: +* Python version: +* Operating System: + +### Description + +Describe what you were trying to get done. +Tell us what happened, what went wrong, and what you expected to happen. + +### What I Did + +``` +Paste the command(s) you ran and the output. +If there was a crash, please include the traceback here. +``` diff --git a/.gitignore b/.gitignore index f1f5f0d..f2c0f99 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ __pycache__/ # Distribution / packaging .Python +env/ build/ develop-eggs/ dist/ @@ -23,7 +24,6 @@ wheels/ *.egg-info/ .installed.cfg *.egg -MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -54,7 +54,6 @@ coverage.xml # Django stuff: *.log local_settings.py -db.sqlite3 # Flask stuff: instance/ @@ -81,14 +80,13 @@ celerybeat-schedule # SageMath parsed files *.sage.py -# Environments +# dotenv .env + +# virtualenv .venv -env/ venv/ ENV/ -env.bak/ -venv.bak/ # Spyder project settings .spyderproject @@ -103,7 +101,13 @@ venv.bak/ # mypy .mypy_cache/ -# pycharm -.idea +# IDE settings +.vscode/ +.idea/ -.hangar +# leftover from examples +*/.hangar/ +*/.ipynb_checkpoints/ +examples/head.stock +examples/.gitignore +docs/*.ipynb diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..fe17f17 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,152 @@ +matrix: + include: + # ====================== Cover 3.7 ====================== + - name: "Linux Python 3.7: Run tests" + os: linux + dist: xenial + language: python + python: '3.7' + cache: pip + env: + - LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so + - SEGFAULT_SIGNALS=all + - TOXENV=py37,report + after_success: + - codecov + + - name: "Win Python 3.7: Run tests" + os: windows + language: shell + env: + - PATH=/c/Python37:/c/Python37/Scripts:$PATH + - TOXENV=py37,report + before_install: + - choco install python --version 3.7.5 + - python -m pip install --upgrade pip + after_success: + - codecov + + - name: "OSX Python 3.7: Run tests" + os: osx + osx_image: xcode11.2 + language: generic + env: + - TR_PYTHON_VERSION=3.7.5 + - PYENV_VERSION=3.7.5 + - TOXENV=py37,report + addons: + homebrew: + packages: + - openssl + - readline + - xz + after_success: + - codecov + cache: + - pip + - directories: + - 
$HOME/.cache/pip + - $HOME/.cache/pyenv + + # ===================== Cover 3.6 ======================= + - name: "Linux Python 3.6: Run tests" + os: linux + dist: xenial + language: python + python: '3.6' + cache: pip + env: + - LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so + - SEGFAULT_SIGNALS=all + - TOXENV=py36,report + after_success: + - codecov + + - name: "Win Python 3.6: Run tests" + os: windows + language: shell + env: + - PATH=/c/Python36:/c/Python36/Scripts:$PATH + - TOXENV=py36,report + before_install: + - choco install python --version 3.6.8 + - python -m pip install --upgrade pip + after_success: + - codecov + + - name: "OSX Python 3.6: Run tests" + os: osx + osx_image: xcode11.2 + language: generic + env: + - TR_PYTHON_VERSION=3.6.8 + - PYENV_VERSION=3.6.8 + - TOXENV=py36,report + addons: + homebrew: + packages: + - openssl + - readline + - xz + after_success: + - codecov + cache: + - pip + - directories: + - $HOME/.cache/pip + - $HOME/.cache/pyenv + + # ======================= Build Doc ===================== + - name: "Build Docs" + os: linux + dist: xenial + language: python + python: '3.7' + cache: pip + env: + - LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so + - SEGFAULT_SIGNALS=all + - TOXENV=docs + addons: + apt_packages: + - pandoc + +before_install: + - | + if [ "$TRAVIS_OS_NAME" == "osx" ]; then + unset PYENV_ROOT + mkdir -p ~/.cache/pyenv/versions + curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer | bash + which pyenv + ln -s ~/.cache/pyenv/versions ~/.pyenv/versions + export PATH="$HOME/.pyenv/bin:$PATH" + pyenv install --skip-existing $TR_PYTHON_VERSION + eval "$(pyenv init -)" + pyenv global $TR_PYTHON_VERSION + # A manual check that the correct version of Python is running. + python --version + python -m pip install -U pip setuptools wheel + fi + - python --version + - uname -a + - if [ "$TRAVIS_OS_NAME" == "linux" ]; then lsb_release -a; fi + +install: + - pip install tox + - pip install codecov + - virtualenv --version + - easy_install --version + - pip --version + - tox --version + +script: + - tox -v + +after_failure: + - more .tox/log/* | cat + - more .tox/*/log/* | cat + +notifications: + email: + on_success: never + on_failure: never diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..a4e1ff7 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,4 @@ +Authors +======= + +* Sherin Thomas diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000..2bb0e54 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,106 @@ +.. highlight:: shell + +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every little bit +helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at https://github.com/tensorwerk/stockroom/issues. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" and "help +wanted" is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. Anything tagged with "enhancement" +and "help wanted" is open to whoever wants to implement it. 
+ +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +stockroom could always use more documentation, whether as part of the +official stockroom docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at https://github.com/tensorwerk/stockroom/issues. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute? Here's how to set up `stockroom` for local development. + +1. Fork the `stockroom` repo on GitHub. +2. Clone your fork locally:: + + $ git clone git@github.com:your_name_here/stockroom.git + +3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: + + $ mkvirtualenv stockroom + $ cd stockroom/ + $ python setup.py develop + +4. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +5. When you're done making changes, check that your changes pass flake8 and the + tests, including testing other Python versions with tox:: + + $ flake8 stockroom tests + $ tox + + To get flake8 and tox, just pip install them into your virtualenv. + +6. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +7. Submit a pull request through the GitHub website. + +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the list in README.rst. +3. The pull request should work for Python 3.6, 3.7 and 3.8. Check + https://travis-ci.org/tensorwerk/stockroom/pull_requests + and make sure that the tests pass for all supported Python versions. diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..97bbb9e --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,8 @@ +======= +History +======= + +0.1.0 (2019-12-12) +------------------ + +* First release on PyPI. diff --git a/LICENSE b/LICENSE index 261eeb9..93e7392 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,16 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ +Apache Software License 2.0 - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +Copyright (c) 2019, Tensorwerk Inc, Sherin Thomas - 1. Definitions. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. +http://www.apache.org/licenses/LICENSE-2.0 - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..965b2dd --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,11 @@ +include AUTHORS.rst +include CONTRIBUTING.rst +include HISTORY.rst +include LICENSE +include README.rst + +recursive-include tests * +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] + +recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/README.md b/README.md deleted file mode 100644 index aeb42b8..0000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# StockRoom -A platform to version models, data, parameters, metrics etc alongside git versioned source code. -Althouh it is built as a high level API kit for [hangar](https://github.com/tensorwerk/hangar-py) and comes as part of hangar itself, user doesn't need to know any founding philosophy of hangar work with stockroom unless you need fine grained control diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..17627eb --- /dev/null +++ b/README.rst @@ -0,0 +1,64 @@ +========= +Stockroom +========= + + +.. image:: https://img.shields.io/pypi/v/stockroom.svg + :target: https://pypi.python.org/pypi/stockroom + +.. image:: https://img.shields.io/travis/hhsecond/stockroom.svg + :target: https://travis-ci.org/hhsecond/stockroom + +.. image:: https://readthedocs.org/projects/stockroom/badge/?version=latest + :target: https://stockroom.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + + + +Stockroom is a platform to version models, data, parameters, metrics etc. alongside git-versioned +source code. It is licensed as Free software under the +**Apache Software License 2.0**. + +Introduction +------------ +Stockroom is built on top of `hangar <https://github.com/tensorwerk/hangar-py>`_ and hence +is highly performant, with a minimal and simple set of APIs. We have tried to reduce the +cognitive overhead as much as possible for a new user to get started. Stockroom is not +a wrapper that hides the internal hangar philosophy from you; it is a tool +that makes common operations easy. + +Stockroom is currently in its first release. It doesn't have an exhaustive test +suite yet, and the APIs could change in backward-incompatible ways (for good reasons). + +Why +--- +Stockroom exists for three reasons: + +- Work hand-in-hand with git: + +Stockroom lets git do the ``checkout`` and relies on that to move between branches/commits. +This allows stockroom to present a very simple and intuitive collection of APIs +while sparing users from having to learn yet another set of commands. + +- Simplify `hangar <https://github.com/tensorwerk/hangar-py>`_ APIs: + +Hangar is an extensive and reliable tool that gives users fine-grained control +for storing and versioning data without compromising speed or efficiency. +Essentially, it tries to do for data what git did for source code. But the set of APIs +hangar provides is correspondingly extensive, and it can be trimmed down by delegating a few tasks to +git and making certain assumptions. That is exactly what stockroom does. + +- Make storage of model + data + params + metrics, and versioning of them, possible in `hangar <https://github.com/tensorwerk/hangar-py>`_ + + + +Example +======= +.. code-block:: python + + from stockroom import StockRoom + import numpy as np + + stock = StockRoom() + stock.data['column1', 'sample1'] = np.random.random((3, 28, 28)) + 
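For context, the README example assumes a stock repository has already been initialized. A minimal bootstrap flow, mirroring the tutorial notebook later in this changeset (every command below appears elsewhere in this diff)::

    $ git init
    $ stock init --name 'Your Name' --email you@example.com
    $ git add .gitignore head.stock
    $ git commit -m 'initialized stock repository'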
diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..382ccd3 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python -msphinx +SPHINXPROJ = stockroom +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..0400ea3 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,34 @@ +.. _ref-api: + +API Documentation +================= +Remember: one of the motives behind stockroom is simplicity, and that is what we have +weighed every time we added or removed an API. What does that +mean? It means that we have tried hard to keep the number of APIs to a minimum +while still catering to the requirements of a developer. Here we discuss the python APIs +available in stockroom. + +Initialization +-------------- +.. autofunction:: stockroom.init_repo + +StockRoom class +--------------- +.. autoclass:: stockroom.StockRoom() + :members: + +Storages +-------- +Stockroom introduces three different storages for different storage needs, and all the +APIs in stockroom deal with these storages + + +.. autoclass:: stockroom.storages.Data() + :members: + +.. autoclass:: stockroom.storages.Model() + :members: + +.. autoclass:: stockroom.storages.Tag() + :members: diff --git a/docs/authors.rst b/docs/authors.rst new file mode 100644 index 0000000..e122f91 --- /dev/null +++ b/docs/authors.rst @@ -0,0 +1 @@ +.. include:: ../AUTHORS.rst diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..2506499 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1 @@ +.. include:: ../HISTORY.rst diff --git a/docs/cli.rst b/docs/cli.rst new file mode 100644 index 0000000..8905584 --- /dev/null +++ b/docs/cli.rst @@ -0,0 +1,14 @@ +.. _ref-cli: + +The stock CLI +============= + +Stockroom introduces a high level, minimalistic command line utility called ``stock``. +It is built to be analogous to what ``hangar`` or ``git`` bring as functionality +through their CLIs. + +Hit ``$ stock --help`` in your terminal whenever you are stuck! + +.. click:: stockroom.cli:main + :prog: stock + :show-nested: diff --git a/docs/conf.py b/docs/conf.py new file mode 100755 index 0000000..3ae1f49 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# +# stockroom documentation build configuration file, created by +# sphinx-quickstart on Fri Jun 9 13:47:02 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory is +# relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. 
+# +import os +import sys + +from pathlib import Path +import shutil + +for file in Path('../examples').iterdir(): + if file.suffix == '.ipynb': + shutil.copy(file, '.') + + +sys.path.insert(0, os.path.abspath('..')) + +import stockroom + + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'nbsphinx', + 'sphinx_click.ext', +] +nbsphinx_execute = 'never' + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'stockroom' +copyright = "2019-2020 Tensorwerk Inc, Sherin Thomas" +author = "Sherin Thomas" + +# The version info for the project you're documenting, acts as replacement +# for |version| and |release|, also used in various other places throughout +# the built documents. +# +# The short X.Y version. +version = stockroom.__version__ +# The full version, including alpha/beta/rc tags. +release = stockroom.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' +html_use_smartypants = True +html_last_updated_fmt = '%b %d, %Y' +html_split_index = False +html_sidebars = { + '**': ['searchbox.html', 'globaltoc.html', 'sourcelink.html'], +} +html_short_title = '%s-%s' % (project, version) + +napoleon_use_ivar = True +napoleon_use_rtype = True +napoleon_use_param = True +napoleon_include_init_with_doc = True + +add_module_names = False +doctest_test_doctest_blocks = None +autoclass_content = 'class' diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..69f9fad --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +Welcome to stockroom's documentation! +====================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + readme + installation + api + cli + tutorial + authors + changelog + +Indices and tables +================== +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..9b64903 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,31 @@ +============ +Installation +============ + +To install stockroom, run this command in your terminal: + +.. code-block:: console + + $ pip install stockroom + +This is the preferred method to install stockroom, as it will always install the most +recent stable release. + +The source for stockroom can be downloaded from the `Github repo`_. 
The code block below shows how to install stockroom from source. + +.. code-block:: console + + $ git clone git://github.com/hhsecond/stockroom + $ cd stockroom + $ python setup.py install + +But if you are planning to contribute to stockroom, you might want to install it +in development mode, which lets you easily test the changes you make. Installing +in development mode requires only a slight tweak to the above commands + +.. code-block:: console + + $ python setup.py develop + +.. _Github repo: https://github.com/hhsecond/stockroom diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..8fa6c48 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=python -msphinx ) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=stockroom + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The Sphinx module was not found. Make sure you have Sphinx installed, + echo.then set the SPHINXBUILD environment variable to point to the full + echo.path of the 'sphinx-build' executable. Alternatively you may add the + echo.Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 ) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 0000000..72a3355 --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1 @@ +.. include:: ../README.rst diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..a3bd1f8 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +sphinx>=1.3 +sphinx-click +nbsphinx +sphinx_rtd_theme +-e . diff --git a/docs/tutorial.rst b/docs/tutorial.rst new file mode 100644 index 0000000..f2384ae --- /dev/null +++ b/docs/tutorial.rst @@ -0,0 +1,11 @@ +.. _ref-tutorial: + +######## +Tutorial +######## + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + with-git diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 0000000..95e3318 --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,6 @@ +jupyter +notebook +jupyterlab +stockroom +python-mnist +PILLOW diff --git a/examples/with-git.ipynb b/examples/with-git.ipynb new file mode 100644 index 0000000..83bb005 --- /dev/null +++ b/examples/with-git.ipynb @@ -0,0 +1,616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Alongside Git\n", + "Stockroom is built to be used alongside git. This tutorial will guide you through a typical git workflow that uses stockroom to\n", + "- Store data\n", + "- Use that data to train a network in PyTorch\n", + "- Version the model as we go\n", + "- Tag the hyperparameters in different experiments\n", + "\n", + "For this tutorial, we build a PyTorch network that classifies handwritten digits from the MNIST dataset. We have divided the whole tutorial into four stages.\n", + "1. Set up the repository\n", + "2. Download some data and store it in stockroom\n", + "3. Train the network and save the model + hyperparameters\n", + "4. Fine-tune the hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Set up the repository\n", + "In a typical software development project, we'll have a git repository ready. 
Let's make that first.\n", + "\n", + "#### Initialize git" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized empty Git repository in /home/hhsecond/mypro/stockroom/examples/.git/\n" + ] + } + ], + "source": [ + "!git init" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Initialize stock\n", + "We need to initialize a stock repository at the same location. A stock initialization is essentially a hangar initialization (if a hangar repo doesn't already exist at the given location) plus the creation of the `head.stock` file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hangar Repo initialized at: /home/hhsecond/mypro/stockroom/examples/.hangar\n", + "Stock file created\n" + ] + } + ], + "source": [ + "!stock init --name sherin --email a@b.c" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Initial git commit\n", + "Now we need to make the first commit. Remember, we use this notebook to drive this workflow tutorial. Versioning the notebook itself might not be a good idea here, since each checkout would change the state of our notebook and hinder us from moving forward. But a typical project workflow would require you to version everything" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On branch master\n", + "\n", + "No commits yet\n", + "\n", + "Untracked files:\n", + " (use \"git add ...\" to include in what will be committed)\n", + "\n", + "\t\u001b[31m.gitignore\u001b[m\n", + "\t\u001b[31m.ipynb_checkpoints/\u001b[m\n", + "\t\u001b[31mhead.stock\u001b[m\n", + "\t\u001b[31mrequirements.txt\u001b[m\n", + "\t\u001b[31mwith-git.ipynb\u001b[m\n", + "\n", + "nothing added to commit but untracked files present (use \"git add\" to track)\n" ] } ], + "source": [ + "!git status" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[master (root-commit) bfe72a8] initialized repo\n", + " 2 files changed, 2 insertions(+)\n", + " create mode 100644 .gitignore\n", + " create mode 100644 head.stock\n" + ] + } + ], + "source": [ + "!echo \"\\ndownloads\" > .gitignore\n", + "!git add .gitignore head.stock\n", + "!git commit -m 'initialized repo'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Download & Store Data\n", + "As in most tutorials, we'll build a fully connected network to predict handwritten digits from the MNIST dataset.\n", + "\n", + "#### Download images\n", + "We download the data using the utility functions below (inspired by https://gist.github.com/goldsborough/6dd52a5e01ed73a642c1e772084bcd03)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlretrieve\n", + "import gzip\n", + "import os\n", + "import sys\n", + "\n", + "\n", + "def report_download_progress(chunk_number, chunk_size, file_size):\n", + " if file_size != -1:\n", + " percent = min(1, (chunk_number * chunk_size) / file_size)\n", + " bar = '#' * int(64 * percent)\n", + " sys.stdout.write('\r0% |{:<64}| {}%'.format(bar, int(percent * 100)))\n", + "\n", + "\n", + "def download(destination_path, url):\n", + " if os.path.exists(destination_path):\n", + " print('{} already exists, skipping ...'.format(destination_path))\n", + " else:\n", + " print('Downloading {} ...'.format(url))\n", + " urlretrieve(url, destination_path, reporthook=report_download_progress)\n", + "\n", + "def unzip(zipped_path):\n", + " unzipped_path = os.path.splitext(zipped_path)[0]\n", + " if os.path.exists(unzipped_path):\n", + " print('{} already exists, skipping ... '.format(unzipped_path))\n", + " return\n", + " with gzip.open(zipped_path, 'rb') as zipped_file:\n", + " with open(unzipped_path, 'wb') as unzipped_file:\n", + " unzipped_file.write(zipped_file.read())\n", + " print('\nUnzipped {} ...'.format(zipped_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloads/train-images-idx3-ubyte.gz already exists, skipping ...\n", + "downloads/train-images-idx3-ubyte already exists, skipping ... \n", + "downloads/train-labels-idx1-ubyte.gz already exists, skipping ...\n", + "downloads/train-labels-idx1-ubyte already exists, skipping ... \n", + "downloads/t10k-images-idx3-ubyte.gz already exists, skipping ...\n", + "downloads/t10k-images-idx3-ubyte already exists, skipping ... \n", + "downloads/t10k-labels-idx1-ubyte.gz already exists, skipping ...\n", + "downloads/t10k-labels-idx1-ubyte already exists, skipping ... \n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "RESOURCES = [\n", + " 'train-images-idx3-ubyte.gz',\n", + " 'train-labels-idx1-ubyte.gz',\n", + " 't10k-images-idx3-ubyte.gz',\n", + " 't10k-labels-idx1-ubyte.gz',\n", + "]\n", + "\n", + "path = Path('downloads')\n", + "path.mkdir(exist_ok=True)\n", + "\n", + "for resource in RESOURCES:\n", + " destination = os.path.join(str(path), resource)\n", + " url = 'http://yann.lecun.com/exdb/mnist/{}'.format(resource)\n", + " download(destination, url)\n", + " unzip(destination)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Store to StockRoom\n", + "We need hangar columns ready before stockroom can store data there. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized Arrayset: image\n", + "Initialized Arrayset: label\n", + "Commit message:\n", + "arrayset initialized\n", + "Commit Successful. 
Digest: a=28a09ff56d69697bc313561b362200ae94b389d5\n" + ] + } + ], + "source": [ + "!hangar arrayset create image INT64 784\n", + "!hangar arrayset create label INT64 1\n", + "!stock commit -m 'arrayset initialized'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from mnist import MNIST\n", + "mndata = MNIST(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "images, labels = mndata.load_training()\n", + "tmpimages, tmplabels = mndata.load_testing()\n", + "images.extend(tmpimages)\n", + "labels.extend(tmplabels)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from stockroom import StockRoom\n", + "stock = StockRoom()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Checking out COMMIT: a=28a09ff56d69697bc313561b362200ae94b389d5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 70000/70000 [00:28<00:00, 2433.96it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "import numpy as np\n", + "\n", + "with stock.optimize(write=True):\n", + " for i in tqdm(range(len(images))):\n", + " img = np.array(images[i])\n", + " label = np.array(labels[i]).reshape(1)\n", + " stock.data['image', i] = img\n", + " stock.data['label', i] = label" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Commit message:\n", + "added data\n", + "Commit Successful. Digest: a=d6b2e5d8bbc397eda5448b3eadc0dc39e14c123e\n" + ] + } + ], + "source": [ + "!stock commit -m 'added data'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Network training\n", + "Let's build a simple fully connected network in PyTorch" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "import torch\n", + "from stockroom import StockRoom\n", + "\n", + "def train(model, optimizer, criterion):\n", + " stock = StockRoom()\n", + "\n", + " with stock.optimize():\n", + " for epoch in range(stock.tag['epoch']):\n", + " running_loss = 0\n", + " trange = tqdm(range(70000))\n", + " for i in trange:\n", + " optimizer.zero_grad()\n", + " sample = torch.from_numpy(stock.data['image', i]).float()\n", + " sample /= 255\n", + " out = model(sample).unsqueeze(0)\n", + " label = torch.from_numpy(stock.data['label', i])\n", + " loss = criterion(out, label)\n", + " running_loss += loss.item()\n", + " loss.backward()\n", + " optimizer.step()\n", + " if i % 1000 == 0 and i != 0:\n", + " trange.set_description(str(running_loss / i))\n", + " stock.model['mnist'] = model.state_dict()\n", + " stock.commit('added model')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sequential(\n", + " (0): Linear(in_features=784, out_features=32, bias=True)\n", + " (1): ReLU()\n", + " (2): Linear(in_features=32, out_features=16, bias=True)\n", + " (3): ReLU()\n", + " (4): Linear(in_features=16, out_features=10, bias=True)\n", + " (5): LogSoftmax()\n", + ")" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch.nn as nn\n", + "\n", + "stock.tag['lr'] = 0.01\n", + "stock.tag['momentum'] = 0.5\n", + "stock.tag['epoch'] = 2\n", + "stock.commit('hyper params')\n", + "\n", + "input_size = 784\n", + "hidden_sizes = [32, 16]\n", + "output_size = 10\n", + "\n", + "model = nn.Sequential(\n", + " nn.Linear(input_size, hidden_sizes[0]),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_sizes[0], hidden_sizes[1]),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_sizes[1], output_size),\n", + " nn.LogSoftmax())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Checking out COMMIT: a=5c291a0b2d946e3bfa359f754837a112df575bd6\n", + " * Checking out COMMIT: a=5c291a0b2d946e3bfa359f754837a112df575bd6\n" + ] + } + ], + "source": [ + "from torch import optim\n", + "\n", + "optimizer = optim.SGD(model.parameters(), lr=stock.tag['lr'], momentum=stock.tag['momentum'])\n", + "criterion = nn.NLLLoss()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Checking out COMMIT: a=5c291a0b2d946e3bfa359f754837a112df575bd6\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/70000 [00:00<?, ?it/s]
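The training loop above ends by saving the trained weights with ``stock.model['mnist'] = model.state_dict()``. Reading them back later is symmetric; a minimal sketch, assuming ``stock.model['mnist']`` returns the stored state_dict and that you rebuild the same architecture defined in the notebook::

    import torch.nn as nn
    from stockroom import StockRoom

    stock = StockRoom()
    # rebuild the exact architecture that was trained above
    model = nn.Sequential(
        nn.Linear(784, 32), nn.ReLU(),
        nn.Linear(32, 16), nn.ReLU(),
        nn.Linear(16, 10), nn.LogSoftmax())
    # assumption: Model storage hands back a state_dict-compatible mapping
    model.load_state_dict(stock.model['mnist'])
    model.eval()  # switch to inference mode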
diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py +from setuptools import setup, find_packages + +with open('README.rst') as readme_file: + readme = readme_file.read() + +with open('HISTORY.rst') as history_file: + history = history_file.read() + +requirements = ['click>=7.0', 'hangar>=0.4.0'] + + +setup( + author="Sherin Thomas", + author_email='sherin@tensorwerk.com', + python_requires='>=3.6', classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", + # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], - python_requires='>=3.6', + description="A hangar wrapper that enables the versioning of model, params " + "and metrics along with data", entry_points={ - 'console_scripts': ['stock = stockroom.cli:main'] + 'console_scripts': [ + 'stock=stockroom.cli:main', + ], }, - install_requires=['hangar'] + install_requires=requirements, + license="Apache Software License 2.0", + long_description=readme + '\n\n' + history, + include_package_data=True, + keywords='stockroom', + name='stockroom', + packages=find_packages(include=['stockroom', 'stockroom.*']), + url='https://github.com/tensorwerk/stockroom', + version='0.1.0', + zip_safe=False, ) diff --git a/stockroom/__init__.py b/stockroom/__init__.py index da6394b..136c998 100644 --- a/stockroom/__init__.py +++ b/stockroom/__init__.py @@ -1,10 +1,5 @@ -from stockroom.storages import DataStore -from stockroom.storages import ModelStore -from stockroom.storages import ParamStore -from stockroom.storages import MetricStore -from .repository import init, commit +from .main import StockRoom +from .repository import init_repo - -# TODO: Simplify APIs by not making users initiate a storage class each time - -__all__ = ['DataStore', 'ModelStore', 'ParamStore', 'MetricStore', 'init', 'commit'] +__all__ = ['StockRoom', 'init_repo'] +__version__ = '0.1.0'
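The slimmed-down package root now exports exactly two names plus the version string; a quick interactive check (outputs follow directly from the module shown above)::

    >>> import stockroom
    >>> stockroom.__version__
    '0.1.0'
    >>> stockroom.__all__
    ['StockRoom', 'init_repo']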
diff --git a/stockroom/cli.py b/stockroom/cli.py index c0d0574..9395308 100644 --- a/stockroom/cli.py +++ b/stockroom/cli.py @@ -1,71 +1,50 @@ -from pathlib import Path import click -from hangar import Repository -from . import repository - - -# TODO: move repetative code in hangar and here to a common function -pass_repo = click.make_pass_decorator(Repository, ensure=True) +from .repository import init_repo +from .main import StockRoom @click.group(no_args_is_help=True, add_help_option=True, invoke_without_command=True) -@click.pass_context -def main(ctx): - cwd = Path.cwd() - ctx.obj = Repository(path=cwd, exists=False) - - -@main.command() -@click.option('--message', '-m', multiple=True, - help=('The commit message. If provided multiple times ' - 'each argument gets converted into a new line.')) -@pass_repo -def commit(repo: Repository, message): - """Commits outstanding changes. - - Commit changes to the given files into the repository. You will need to - 'push' to push up your changes to other repositories. - """ - from hangar.records.summarize import status - if not message: - with repo.checkout(write=True) as co: - diff = co.diff.staged() - status_txt = status(co.branch_name, diff.diff) - status_txt.seek(0) - marker = '# Changes To Be committed: \n' - hint = ['\n', '\n', marker, '# \n'] - for line in status_txt.readlines(): - hint.append(f'# {line}') - # open default system editor - message = click.edit(''.join(hint)) - if message is None: - click.echo('Aborted!') - return - msg = message.split(marker)[0].rstrip() - if not msg: - click.echo('Aborted! Empty commit message') - return - # TODO: should be done in the __exit__ of hangar checkout - co.close() - else: - msg = '\n'.join(message) - click.echo('Commit message:\n' + msg) - try: - digest = repository.commit(message) - except (FileNotFoundError, RuntimeError) as e: - raise click.ClickException(e) - click.echo(f'Commit Successful. Digest: {digest}') +def main(): + """ + With ``stock`` we introduce the minimal set of commands necessary to run a + git + stockroom workflow. You will also be able to set up GitHub hooks for a few + ``stock`` actions in an upcoming release. + """ + pass @main.command() +@click.option('--name', prompt='User Name', help='First and last name of user') +@click.option('--email', prompt='User Email', help='Email address of the user') +@click.option('--overwrite', is_flag=True, default=False, help='overwrite a repository if it exists at the current path') def init(name, email, overwrite): + """ + Initialize a stockroom repository. A stockroom repository is a hangar repository plus + a `head.stock` file that will be tracked by git. + """ try: - repository.init(name, email, overwrite) + init_repo(name, email, overwrite) except RuntimeError as e: - raise click.ClickException(e) + raise click.ClickException(e) # type: ignore + +@main.command() +@click.option('--message', '-m', multiple=True, + help=('The commit message. If multiple arguments are provided, ' + 'each of them gets converted into a new line')) +def commit(message): + """ + Make a stock commit. A stock commit consists of two actions: + 1. Make a hangar commit, adding the changed data to the repository. + 2. Update the `head.stock` file, which should then be tracked with a git commit. + """ + stock = StockRoom() + msg = '\n'.join(message) + click.echo('Commit message:\n' + msg) + try: + digest = stock.commit(msg) + except (FileNotFoundError, RuntimeError) as e: + raise click.ClickException(e) # type: ignore + click.echo(f'Commit Successful. Digest: {digest}')
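A typical session with the reworked commands looks like this (path and digest are illustrative placeholders)::

    $ stock init --name 'Your Name' --email you@example.com
    Hangar Repo initialized at: /path/to/project/.hangar
    Stock file created
    $ stock commit -m 'added data'
    Commit message:
    added data
    Commit Successful. Digest: a=<digest>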
The rationale here is, if you provide the path, we + trust you that you know what you doing on that path + """ + def __init__(self, path: Union[str, Path, None] = None): + self.path = Path(path) if path else get_stock_root(Path.cwd()) + self._repo = StockRepository(self.path) + + self.model = Model(self._repo) + self.data = Data(self._repo) + self.tag = Tag(self._repo) + + @property + def get_hangar_checkout(self, write: bool = False) -> Any: + """ + Fetch the hangar checkout object that's been used by stockroom internally. Don't + do this unless you know what you are doing. Directly interacting with hangar + could tamper the data stored by stockroom if you are not familiar with how hangar + stores data and it's APIs. + + Parameters + ---------- + write : bool + Whether you need a write enabled checkout or not + + Returns + ------ + Union[ReaderCheckout, WriterCheckout] + A hangar checkout object which can be used to interact with the repository + data + + Warning + ------- + You won't be able to fetch a write enabled checkout if you are in ``optimize`` + context manager. Similarly if you fetch a write enabled checkout from here, + you neither be able to do any write operation through stockroom nor be able to + open ``optimize`` context manager + """ + return self._repo.hangar_repository.checkout(write=write) + + @contextmanager + def optimize(self, write=False): + """ + This context manager, on `enter`, asks the :class:`StockRepository` object to + open the global checkout. Global checkout is being stored as property of the + repository singleton. Hence all the downstream tasks will get this opened + checkout until it is closed. This global checkout will be closed on the `exit` of + this context manager + """ + if self._repo.is_optimized: + raise RuntimeError("Attempt to open one optimized checkout while another is " + "in action in the same process") + try: + self._repo.open_global_checkout(write) + yield None + finally: + self._repo.close_global_checkout() + + def commit(self, message: str) -> str: + """ + Make a stock commit. A stock commit is a hangar commit plus writing the commit + hash to the stock file. This function opens the stock checkout in write mode and + close after the commit. 
+ def commit(self, message: str) -> str: + """ + Make a stock commit. A stock commit is a hangar commit plus writing the commit + hash to the stock file. This function opens the stock checkout in write mode and + closes it after the commit, which means no other write operation should be running + while a stock commit is in progress + """ + with self._repo.write_checkout() as co: + digest = co.commit(message) + set_current_head(self._repo.stockroot, digest) + return digest diff --git a/stockroom/parser.py b/stockroom/parser.py index d93658d..5d904be 100644 --- a/stockroom/parser.py +++ b/stockroom/parser.py @@ -1,35 +1,41 @@ -# TODO: best practices like utf8 -# TODO: is this separater enough SEP = '--_' -PREFIX = '_STOCK' +PREFIX = '_STK' -def metakey(model, name): - return f"{PREFIX}_metakey_{model}_{name}" +# =================================================================== +# Metadata & Arrayset key parsers for model store +# =================================================================== +def model_metakey(model, name): + return f"{PREFIX}{SEP}{model}{SEP}{name}" -def model_asetkey_from_details(*args): - # TODO: make more reliable hash rather than time.time() - asetkey = f"{PREFIX}{SEP}" - return asetkey + SEP.join(args) +def modelkey(name, longest, dtype): + return f"{PREFIX}{SEP}{name}{SEP}{longest}{SEP}{dtype}" -def shape_asetkey_from_model_asetkey(model_asetkey): - return model_asetkey + '_shape' +def model_shapekey(name, longest): + return f"{PREFIX}{SEP}{name}{SEP}{longest}{SEP}shape" +# =================================================================== +# Metadata Value parsers +# =================================================================== -# TODO: move this somewhere more sensib -def layers_to_string(layers): - return ','.join(layers) +def stringify(lst): + return ','.join(lst) if lst else '' -def string_to_layers(string): - return string.split(',') +def destringify(string): + return string.split(',') if string else '' -def dtypes_to_string(dtypes): - return ','.join(dtypes) +# =================================================================== +# Tag keys +# =================================================================== -def string_to_dtypes(string): - return string.split(',') +def tagkey(name): + return f"{PREFIX}{SEP}tag{SEP}{name}" + + +def tag_typekey(name): + return f"{PREFIX}{SEP}{name}{SEP}tag{SEP}type"
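The key builders above are pure string formatting, so their behavior follows directly from ``PREFIX = '_STK'`` and ``SEP = '--_'``::

    >>> from stockroom import parser
    >>> parser.model_metakey('mnist', 'library')
    '_STK--_mnist--_library'
    >>> parser.modelkey('mnist', 5, 'float32')
    '_STK--_mnist--_5--_float32'
    >>> parser.tagkey('lr')
    '_STK--_tag--_lr'
    >>> parser.stringify(['fc1', 'fc2'])
    'fc1,fc2'
    >>> parser.destringify('fc1,fc2')
    ['fc1', 'fc2']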
+
+
+class StockRepository(metaclass=RootTracker):
+    """
+    A StockRoom wrapper class for hangar repo operations. Every hangar repo
+    interaction that is done through stockroom (other than stock init) should go
+    through a checkout created from this class. Unlike hangar's Repository, this
+    class's constructor assumes the hangar repo is already initialized. Hangar makes
+    sure only one write-enabled checkout is active at any time. The constructor
+    creates the hangar repo object on instantiation.
+    """
+
+    def __init__(self, root):
+        self._root = root
+        self._hangar_repo = Repository(root)
+        self._optimized_Rcheckout = None
+        self._optimized_Wcheckout = None
+        self._has_optimized = {'R': False, 'W': False}
+
+    @property
+    def hangar_repository(self):
+        return self._hangar_repo
+
+    @property
+    def is_optimized(self):
+        return any(self._has_optimized.values())
+
+    def open_global_checkout(self, write):
+        head_commit = get_current_head(self._root)
+        self._optimized_Rcheckout = self._hangar_repo.checkout(commit=head_commit)
+        self._optimized_Rcheckout.__enter__()
+        self._has_optimized['R'] = True
+        if write:
+            self._optimized_Wcheckout = self._hangar_repo.checkout(write=True)
+            self._optimized_Wcheckout.__enter__()
+            self._has_optimized['W'] = True
+
+    def close_global_checkout(self):
+        self._has_optimized['R'] = False
+        self._optimized_Rcheckout.__exit__()
+        self._optimized_Rcheckout.close()
+        self._optimized_Rcheckout = None
+        if self._has_optimized['W']:
+            self._has_optimized['W'] = False
+            self._optimized_Wcheckout.__exit__()
+            self._optimized_Wcheckout.close()
+            self._optimized_Wcheckout = None
+
+    @contextmanager
+    def read_checkout(self):
+        """
+        An API similar to hangar checkout in read mode, but it creates the checkout
+        object using the commit hash from the stock file instead of the user
+        supplying one. This enables users to rely on git checkout for hangar checkout
+        as well. This checkout is designed as a context manager that makes sure the
+        checkout is closed. On entry and exit, the CM checks for the existence of a
+        global checkout. On entry, if a global checkout exists, it returns that
+        instead of creating a new checkout. On exit, it doesn't close the global
+        checkout; it lets the ``optimize`` CM handle the closure instead
+        """
+        if self._has_optimized['R']:
+            co = self._optimized_Rcheckout
+        else:
+            head_commit = get_current_head(self._root)
+            co = self._hangar_repo.checkout(commit=head_commit)
+        try:
+            yield co
+        finally:
+            if not self._has_optimized['R']:
+                co.close()
+
+    @contextmanager
+    def write_checkout(self):
+        """
+        An API similar to hangar checkout in write mode, but it closes the checkout
+        on exit of the CM. It also monitors the existence of the global checkout and
+        opens or closes a local checkout only if the global checkout doesn't exist
+        """
+        if self._has_optimized['W']:
+            co = self._optimized_Wcheckout
+        else:
+            co = self._hangar_repo.checkout(write=True)
+        try:
+            yield co
+        finally:
+            if not self._has_optimized['W']:
+                co.close()
+
+    @property
+    def stockroot(self) -> Path:
+        """
+        Returns the root of the stock repository
+        """
+        return self._root
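+
+
+# An illustrative sketch of the checkout context managers above (internal
+# API; ``co.metadata`` is hangar's own write interface):
+#
+#   repo = StockRepository(root)
+#   with repo.write_checkout() as co:
+#       co.metadata['key'] = 'value'   # checkout is closed on exit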
+
+
+# ================================== User facing Repository functions ================================
+
-def init(name, email, overwrite):
+def init_repo(name=None, email=None, overwrite=False):
     """ init hangar repo, create stock file and add details to .gitignore """
     if not Path.cwd().joinpath('.git').exists():
         raise RuntimeError("stock init should execute only in a"
                            " git repository. Try running stock "
                            "init after git init")
     repo = Repository(Path.cwd(), exists=False)
-    if repo.initialized and (not overwrite):
+    if not overwrite and repo.initialized:
         commit_hash = repo.log(return_contents=True)['head']
-        print(f'Repo already exists at: {repo.path}')
+        print(f'Hangar Repo already exists at {repo.path}. '
+              f'Initializing it as a stock repository')
     else:
+        if name is None or email is None:
+            raise ValueError("Both ``name`` and ``email`` are required "
+                             "to initialize a new repository")
         commit_hash = ''
         repo.init(user_name=name, user_email=email, remove_old=overwrite)
+        # closing the environment to avoid issues on windows
+        repo._env._close_environments()

-    stock_file = Path.cwd().joinpath('head.stock')
+    stock_file = Path.cwd()/'head.stock'
     if not stock_file.exists():
         with open(stock_file, 'w+') as f:
             f.write(commit_hash)
         print("Stock file created")
-    gitignore = Path.cwd().joinpath('.gitignore')
-    # TODO make sure this creates the file when file doesn't exist
+    gitignore = Path.cwd()/'.gitignore'
     with open(gitignore, 'a+') as f:
         f.seek(0)
         if '.hangar' not in f.read():
-            f.write('\n.hangar\n')
-
-
-def commit(message):
-    repo = Repository(Path.cwd())
-    with repo.checkout(write=True) as co:
-        root = get_stock_root()
-        if not root:
-            raise FileNotFoundError("Could not find stock file. Aborting..")
-        digest = co.commit(message)
-        with open(root.joinpath('head.stock'), 'w') as f:
-            f.write(digest)
-        # TODO: print message about file write as well
-        # TODO: should be done in the __exit__ of hangar checkout
-        co.close()
-    return digest
+            f.write('\n# hangar artifacts\n.hangar\n')
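+
+
+# An illustrative usage sketch of ``init_repo`` (assumes the current working
+# directory is already a git repository):
+#
+#   from stockroom import init_repo
+#   init_repo(name='s', email='a@b.c')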
diff --git a/stockroom/storages/__init__.py b/stockroom/storages/__init__.py
index b1873b0..570754d 100644
--- a/stockroom/storages/__init__.py
+++ b/stockroom/storages/__init__.py
@@ -1,4 +1,4 @@
-from .datastore import DataStore
-from .modelstore import ModelStore
-from .metricstore import MetricStore
-from .paramstore import ParamStore
+from .data import Data
+from .model import Model
+from .tag import Tag
+__all__ = ['Data', 'Model', 'Tag']
diff --git a/stockroom/storages/data.py b/stockroom/storages/data.py
new file mode 100644
index 0000000..3d362c9
--- /dev/null
+++ b/stockroom/storages/data.py
@@ -0,0 +1,36 @@
+
+class Data:
+    """
+    Data storage is essentially a wrapper over hangar's column API which lets
+    stockroom handle the checkout scope. Instance creation is not something a user
+    would do directly; instead, a created instance is available at
+    :class:`stockroom.StockRoom`
+
+    Note
+    ----
+    Each ``__getitem__`` or ``__setitem__`` call will open & close a hangar checkout.
+    Unlike for other storages, this is crucial information for the data storage
+    because both reading and writing of data happen quite frequently in a pipeline,
+    unlike saving or retrieving models, parameters or metrics. So as an optimization,
+    you could do the data reads/writes inside the context manager
+    :meth:`stockroom.StockRoom.optimize`
+
+    Examples
+    --------
+    >>> stock = StockRoom()
+    >>> stock.data['column1', 'sample1'] = np.arange(20).reshape(5, 4)
+    >>> sample = stock.data['column1', 'sample5']
+
+    Inside context manager
+
+    >>> with stock.optimize():
+    ...     sample = stock.data['column1', 'sample1']
+    """
+    def __init__(self, repo):
+        self._repo = repo
+
+    def __setitem__(self, key, value):
+        with self._repo.write_checkout() as co:
+            co[key] = value
+
+    def __getitem__(self, key):
+        with self._repo.read_checkout() as co:
+            return co[key]
diff --git a/stockroom/storages/datastore.py b/stockroom/storages/datastore.py
deleted file mode 100644
index 08996e2..0000000
--- a/stockroom/storages/datastore.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from .storagebase import StorageBase
-from ..utils import get_current_head, get_stock_root
-
-
-class DataStore(StorageBase):
-    def __init__(self):
-        super().__init__()
-
-    def __getitem__(self, item):
-        root = get_stock_root()
-        dset = self.repo.checkout(commit=get_current_head(root))
-        # TODO: rigorous check like in hangar
-        if isinstance(item, tuple):
-            aset = item[0]
-            index = item[1]
-            return dset[aset, index]
-        return dset[item]
-
-    def __setitem__(self, item, value):
-        # TODO: optimized set item like context manager
-        dset = self.repo.checkout(write=True)
-        # TODO: rigorous check like in hangar
-        if isinstance(item, tuple):
-            aset = item[0]
-            index = item[1]
-            dset[aset, index] = value
-        dset[item] = value  # this will raise error downstream
-        dset.close()
-
-
-
-
diff --git a/stockroom/storages/metricstore.py b/stockroom/storages/metricstore.py
deleted file mode 100644
index a8c6b93..0000000
--- a/stockroom/storages/metricstore.py
+++ /dev/null
@@ -1,2 +0,0 @@
-class MetricStore:
-    pass
diff --git a/stockroom/storages/model.py b/stockroom/storages/model.py
new file mode 100644
index 0000000..f9bb176
--- /dev/null
+++ b/stockroom/storages/model.py
@@ -0,0 +1,178 @@
+import warnings
+
+import numpy as np
+
+from .. import parser
+from ..utils import LazyLoader
+
+torch = LazyLoader('torch', globals(), 'torch')
+tf = LazyLoader('tf', globals(), 'tensorflow')
+
+
+class Model:
+    """
+    The Model class utilizes hangar columns to store the pieces of a model and hangar
+    metadata to store the information required to collate them back into a model.
+    Currently, it supports ``keras.Model`` and ``torch.nn.Module`` models. On
+    :meth:`stockroom.storages.Model.save_weights`, a Model instance creates a few
+    columns (one column for each data type) to store the weights and one column
+    specifically to store the shape of each layer. This shape column is needed
+    because the weights of each layer are flattened before saving. This is essential
+    since handling variable shapes and variable ranks is more complex than flattening
+    and reshaping-back the weights.
+
+    Examples
+    --------
+    >>> import torch
+    >>> import tensorflow as tf
+    >>> torch_model = torch.nn.Sequential(...)
+    >>> stock.model['torch_model'] = torch_model.state_dict()
+    >>> tf_model = tf.keras.Sequential()
+    >>> tf_model.add(tf.keras.layers.Dense(64, activation='relu'))
+    >>> stock.model['tf_model'] = tf_model.get_weights()
+
+    You can also make it easier by calling the convenience functions that know how to
+    fetch weights from a model and how to put weights back onto it. Check out
+    :meth:`Model.save_weights` & :meth:`Model.load_weights` for more details
+    """
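+    # An illustrative numpy-only sketch of the flatten/reshape scheme described
+    # above (the real storage goes through hangar columns):
+    #
+    #   w = np.random.rand(3, 4)                        # one layer's weights
+    #   flat, shape = w.reshape(-1), np.array(w.shape)  # what gets stored
+    #   restored = flat.reshape(shape)                  # what gets rebuilt
+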
+    def __init__(self, repo):
+        self._repo = repo
+
+    def __setitem__(self, name, weights):
+        if isinstance(weights, dict):
+            layers = weights.keys()
+            weights = [x.numpy() for x in weights.values()]
+            library = 'torch'
+            library_version = torch.__version__
+        elif isinstance(weights, list):
+            library = 'tf'
+            layers = None
+            library_version = tf.__version__
+        else:
+            raise TypeError("Unknown type. Weights have to be a dict or a list")
+        longest = max([len(x.reshape(-1)) for x in weights])
+        dtypes = [w.dtype.name for w in weights]
+
+        with self._repo.write_checkout() as co:
+            co.metadata[parser.model_metakey(name, 'library')] = library
+            co.metadata[parser.model_metakey(name, 'libraryVersion')] = library_version
+            co.metadata[parser.model_metakey(name, 'longest')] = str(longest)
+            co.metadata[parser.model_metakey(name, 'dtypes')] = parser.stringify(dtypes)
+            co.metadata[parser.model_metakey(name, 'numLayers')] = str(len(weights))
+            co.metadata[parser.model_metakey(name, 'layers')] = parser.stringify(layers)
+
+            # ---------- Create arraysets if not exist -----------------
+            shapeKey = parser.model_shapekey(name, str(longest))
+            if shapeKey not in co.arraysets.keys():
+                shape_typ = np.array(1).dtype  # C long = int32 in win64; int64 elsewhere
+                co.arraysets.init_arrayset(shapeKey, 10, shape_typ, variable_shape=True)
+            for i, w in enumerate(weights):
+                modelKey = parser.modelkey(name, str(longest), dtypes[i])
+                if modelKey not in co.arraysets.keys():
+                    co.arraysets.init_arrayset(
+                        modelKey, longest, np.dtype(dtypes[i]), variable_shape=True)
+            # ---------------------------------------------------------
+
+            shape_aset = co.arraysets[shapeKey]
+            for i, w in enumerate(weights):
+                model_aset = co.arraysets[parser.modelkey(name, longest, dtypes[i])]
+                model_aset[i] = w.reshape(-1)
+                if w.shape:
+                    shape_aset[i] = np.array(w.shape)
+                else:
+                    # C long = int32 in win64; int64 elsewhere
+                    shape_typ = np.array(1).dtype
+                    shape_aset[i] = np.array(()).astype(shape_typ)
+
+    def __getitem__(self, name):
+        with self._repo.read_checkout() as co:
+            try:
+                library = co.metadata[parser.model_metakey(name, 'library')]
+            except KeyError:
+                raise KeyError(f"Model with key {name} not found")
+            library_version = co.metadata[parser.model_metakey(name, 'libraryVersion')]
+            longest = int(co.metadata[parser.model_metakey(name, 'longest')])
+            dtypes = parser.destringify(co.metadata[parser.model_metakey(name, 'dtypes')])
+            num_layers = int(co.metadata[parser.model_metakey(name, 'numLayers')])
+            layers = parser.destringify(co.metadata[parser.model_metakey(name, 'layers')])
+
+            shapeKey = parser.model_shapekey(name, longest)
+            shape_aset = co.arraysets[shapeKey]
+            weights = []
+            for i in range(num_layers):
+                modelKey = parser.modelkey(name, longest, dtypes[i])
+                aset = co.arraysets[modelKey]
+                w = aset[i].reshape(np.array(shape_aset[i]))
+                weights.append(w)
+            if library == 'torch':
+                if torch.__version__ != library_version:
+                    warnings.warn(f"PyTorch version used while storing the model "
+                                  f"({library_version}) is not the same as the one "
+                                  f"installed in the current environment, "
+                                  f"i.e. {torch.__version__}")
+                return {layers[i]: torch.from_numpy(weights[i]) for i in range(num_layers)}
+
+            else:
+                if tf.__version__ != library_version:
+                    warnings.warn(f"Tensorflow version used while storing the model "
+                                  f"({library_version}) is not the same as the one "
+                                  f"installed in the current environment, "
+                                  f"i.e. {tf.__version__}")
+                return weights
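+
+    # For illustration: for a model named 'net', ``__setitem__`` above writes
+    # metadata under keys built by the parser module, e.g.
+    # parser.model_metakey('net', 'library') == '_STK--_net--_library',
+    # alongside one arrayset per weight dtype and one for the layer shapes.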
+
+    def save_weights(self, name, model):
+        """
+        A convenient function to call when you don't want to deal with weight
+        extraction from the model, regardless of which framework you use to write the
+        model, as long as that framework is supported by stockroom. This function
+        expects the model object from one of the supported frameworks. It will call
+        the corresponding function of that framework to fetch the weights and then
+        call :meth:`Model.__setitem__` to save the weights.
+
+        Parameters
+        ----------
+        name : str
+            Name of the key to which the model parameters are saved
+        model : Any
+            Object from any supported framework
+
+        Examples
+        --------
+        >>> stock.model.save_weights('torch_model', torch_model)
+
+        """
+        if hasattr(model, 'state_dict'):
+            weights = model.state_dict()
+        elif hasattr(model, 'get_weights'):
+            weights = model.get_weights()
+        else:
+            raise TypeError("Unknown model type. StockRoom can work only with "
+                            "``keras.Model`` or ``torch.nn.Module`` models")
+        self[name] = weights
+
+    def load_weights(self, name, model):
+        """
+        Load the parameters from the hangar repo and put them back into the model
+        object. It looks for all the columns that match the model name and reshapes
+        the weights back to their actual shapes (the actual shape is stored in
+        another column). Different frameworks have different ways of loading the
+        parameters into the model object. To identify this,
+        :meth:`Model.save_weights` also saves the framework name while saving the
+        model
+
+        Parameters
+        ----------
+        name : str
+            Name of the key from which the model parameters are loaded
+        model : Any
+            Model object from any supported framework onto which the parameters are
+            loaded. Loading the parameters is an inplace operation and hence this
+            function doesn't return anything
+
+        Examples
+        --------
+        >>> stock.model.load_weights('torch_model', torch_model)
+        """
+        weights = self[name]
+        if hasattr(model, 'load_state_dict'):
+            model.load_state_dict(weights)
+        elif hasattr(model, 'set_weights'):
+            model.set_weights(weights)
+        else:
+            raise TypeError("Unknown model type. StockRoom can work only with "
+                            "``keras.Model`` or ``torch.nn.Module`` models")
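+
+
+# An illustrative end-to-end sketch of the two helpers above (assumes an
+# existing ``stock`` handle and a torch model):
+#
+#   stock.model.save_weights('net', torch_model)   # extracts the state dict
+#   stock.commit('save net')
+#   stock.model.load_weights('net', torch_model)   # in-place restore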
diff --git a/stockroom/storages/modelstore.py b/stockroom/storages/modelstore.py
deleted file mode 100644
index 796371c..0000000
--- a/stockroom/storages/modelstore.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import numpy as np
-
-from .storagebase import StorageBase
-from ..utils import get_current_head, get_stock_root
-from .. import parser
-
-
-def get_aset(co, name, dtype=None, longest=None, variable=False):
-    try:
-        aset = co.arraysets[name]
-        return aset
-    except KeyError:
-        pass
-    aset = co.arraysets.init_arrayset(
-        name, dtype=np.dtype(dtype), shape=(longest,), variable_shape=variable)
-    return aset
-
-# TODO: figure out what' the importance of max shape if var_shape is True
-
-
-class ModelStore(StorageBase):
-    def __init__(self):
-        super().__init__()
-
-    def save(self, name, model):
-        # TODO: optimize
-        co = self.repo.checkout(write=True)
-        if hasattr(model, 'state_dict'):
-            library = 'torch'
-            state = model.state_dict()
-            layers = list(state.keys())
-            # TODO: forloop for all needs or list comprehension few times
-            weights = [x.numpy() for x in state.values()]
-            str_layer = parser.layers_to_string(layers)
-            co.metadata[parser.metakey(name, 'layers')] = str_layer
-        elif hasattr(model, 'get_weights'):
-            library = 'tf'
-            # tf model
-            weights = model.get_weights()
-        else:
-            raise TypeError("Unknown model type. StockRoom can work with only "
-                            "``Keras.Model`` or ``torch.nn.Module`` modules")
-        longest = max([len(x.reshape(-1)) for x in weights])
-        co.metadata[parser.metakey(name, 'library')] = library
-        co.metadata[parser.metakey(name, 'longest')] = str(longest)
-        co.metadata[parser.metakey(name, 'num_layers')] = str(len(weights))
-        dtypes = [w.dtype.name for w in weights]
-        str_dtypes = parser.dtypes_to_string(dtypes)
-        co.metadata[parser.metakey(name, 'dtypes')] = str_dtypes
-        aset_prefix = parser.model_asetkey_from_details(name, str(longest))
-        co.metadata[parser.metakey(name, 'aset_prefix')] = aset_prefix
-        shape_asetn = parser.shape_asetkey_from_model_asetkey(name)
-        shape_aset = co.arraysets.init_arrayset(
-            shape_asetn, shape=(10,), dtype=np.int64, variable_shape=True)
-        for i, w in enumerate(weights):
-            asetn = parser.model_asetkey_from_details(aset_prefix, dtypes[i])
-            aset = get_aset(co, asetn, dtypes[i], longest, variable=True)
-            aset[i] = w.reshape(-1)
-            if w.shape:
-                shape_aset[i] = np.array(w.shape)
-            else:
-                shape_aset[i] = np.array(()).astype('int64')
-        co.close()
-
-    def load(self, name, model):
-        import torch
-        root = get_stock_root()
-        head_commit = get_current_head(root)
-        co = self.repo.checkout(commit=head_commit)
-        aset_prefix = co.metadata[parser.metakey(name, 'aset_prefix')]
-        dtypes = parser.string_to_dtypes(co.metadata[parser.metakey(name, 'dtypes')])
-        library = co.metadata[parser.metakey(name, 'library')]
-        num_layers = int(co.metadata[parser.metakey(name, 'num_layers')])
-        weights = []
-        for i in range(num_layers):
-            asetn = parser.model_asetkey_from_details(aset_prefix, dtypes[i])
-            aset = get_aset(co, asetn)
-            shape_asetn = parser.shape_asetkey_from_model_asetkey(name)
-            shape_aset = co.arraysets[shape_asetn]
-            w = aset[i].reshape(np.array(shape_aset[i]))
-            weights.append(w)
-        if len(weights) != num_layers:
-            raise RuntimeError("Critical: length doesn't match. Raise an issue")
-        if library == 'torch':
-            str_layers = co.metadata[parser.metakey(name, 'layers')]
-            layers = parser.string_to_layers(str_layers)
-            if len(layers) != num_layers:
-                raise RuntimeError("Critical: length doesn't match. Raise an issue")
-            state = {layers[i]: torch.from_numpy(weights[i]) for i in range(num_layers)}
-            model.load_state_dict(state)
-        else:
-            model.set_weights(weights)
-
-
-
-
diff --git a/stockroom/storages/paramstore.py b/stockroom/storages/paramstore.py
deleted file mode 100644
index 62e91cb..0000000
--- a/stockroom/storages/paramstore.py
+++ /dev/null
@@ -1,2 +0,0 @@
-class ParamStore:
-    pass
\ No newline at end of file
diff --git a/stockroom/storages/storagebase.py b/stockroom/storages/storagebase.py
deleted file mode 100644
index 3836f8b..0000000
--- a/stockroom/storages/storagebase.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from hangar import Repository
-from ..utils import get_stock_root
-
-
-class StorageBase(object):
-
-    def __init__(self):
-        if not hasattr(StorageBase, 'repo'):
-            root = get_stock_root()
-            if root is None:
-                raise RuntimeError("Could not find the stock root. "
-                                   "Did you forget to `stock init`?")
-            StorageBase.root = root
-            StorageBase.repo = Repository(root)
diff --git a/stockroom/storages/tag.py b/stockroom/storages/tag.py
new file mode 100644
index 0000000..3c7b98b
--- /dev/null
+++ b/stockroom/storages/tag.py
@@ -0,0 +1,49 @@
+from .. import parser
+
+
+class Tag:
+    """
+    The Tag store, as the name suggests, stores tags related to an experiment.
+    Ideally/eventually this store will keep the information at the commit level and
+    will not pass it down the commit history tree. But currently the internal
+    implementation of hangar doesn't allow that, and hence we store the information
+    in hangar's metadata store. It currently takes the `int`, `float` & `str` data
+    types and converts them to strings, which is the only data type supported by
+    hangar metadata. :class:`Tag` also stores the type of the data in another
+    metadata "column", which is used while pulling the data back from the Tag store.
+
+    Examples
+    --------
+    >>> stock.tag['epochs'] = 1000
+    >>> stock.tag['lr'] = 0.0001
+    >>> stock.tag['optimizer'] = 'adam'
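+
+    After a commit, values come back cast to their stored type:
+
+    >>> stock.commit('hyper params')
+    >>> stock.tag['epochs']
+    1000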
+    """
+    def __init__(self, repo):
+        self.typecaster = {'int': int, 'float': float, 'str': str}
+        self._repo = repo
+
+    def __setitem__(self, key, value):
+        with self._repo.write_checkout() as co:
+            if isinstance(value, int):
+                value_type = 'int'
+            elif isinstance(value, float):
+                value_type = 'float'
+            elif isinstance(value, str):
+                value_type = 'str'
+            else:
+                raise TypeError("Tag store can accept only ``int``, ``float`` or ``str``")
+            co.metadata[parser.tagkey(key)] = str(value)
+            co.metadata[parser.tag_typekey(key)] = value_type
+
+    def __getitem__(self, key):
+        with self._repo.read_checkout() as co:
+            try:
+                value = co.metadata[parser.tagkey(key)]
+                value_type = co.metadata[parser.tag_typekey(key)]
+            except KeyError:
+                raise KeyError(f"Data not found with key {key}")
+            try:
+                return self.typecaster[value_type](value)
+            except KeyError:
+                raise KeyError(f"Data tampering suspected. Could not "
+                               f"read the data type {value_type}")
diff --git a/stockroom/utils.py b/stockroom/utils.py
index 893c1c6..58a9522 100644
--- a/stockroom/utils.py
+++ b/stockroom/utils.py
@@ -1,9 +1,25 @@
+import types
+import importlib
 from pathlib import Path


-def get_stock_root():
-    # TODO: would CWD work always
-    path = Path.cwd()
+def get_stock_root(path: Path) -> Path:
+    """
+    Traverse from the given path up to the system root to figure out the root of the
+    stock repo. A stock repo must be a hangar repo, a git repo, and must have a
+    head.stock file. The head.stock file has the information required for stockroom
+    to manage checkouts, branching etc., and it is git tracked.
+
+    Parameters
+    ----------
+    path : Path
+        path from which the stock root check starts
+
+    Returns
+    -------
+    Path
+        Location of the root of the stock repo
+    """
     while True:
         stock_exist = path.joinpath('head.stock').exists()
         if stock_exist:
@@ -14,18 +30,73 @@
                             "hangar repository")
             return path
         if path == path.parent:  # system root check
-            return None
+            raise RuntimeError("Could not find stock root. Are you inside a "
+                               "stock repository?")
         path = path.parent


-def get_current_head(root: Path):
-    head = root.joinpath('head.stock')
-    with open(head, 'r') as f:
+def get_current_head(root: Path) -> str:
+    """
+    Reads the stock file and returns the commit hash if found
+
+    Parameters
+    ----------
+    root : Path
+        The stock root path
+
+    Returns
+    -------
+    str
+        commit hash if found; an empty string otherwise
+    """
+    with open(root/'head.stock', 'r') as f:
         commit = f.read()
     return commit if commit else ''
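+
+
+# An illustrative sketch tying the two helpers above together: resolve the
+# stock root from anywhere inside the repository, then read the current HEAD.
+#
+#   root = get_stock_root(Path.cwd())
+#   head = get_current_head(root)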


 def set_current_head(root: Path, commit: str):
-    head = root.joinpath('head.stock')
-    with open(head, 'w+') as f:
+    """
+    Write a commit hash to the stock file.
+
+    Parameters
+    ----------
+    root : Path
+        The stock root path
+    commit : str
+        Commit hash that will be written to the stock file
+    """
+    with open(root/'head.stock', 'w+') as f:
         f.write(commit)
+
+
+class LazyLoader(types.ModuleType):
+    """
+    Lazily import a module, mainly to avoid pulling in large dependencies
+    """
+
+    def __init__(self, local_name, parent_module_globals, name):
+        self._local_name = local_name
+        self._parent_module_globals = parent_module_globals
+        super(LazyLoader, self).__init__(name)
+
+    def _load(self):
+        """Load the module and insert it into the parent's globals.
+
+        Import the target module and insert it into the parent's namespace.
+        Update this object's dict so that if someone keeps a reference to the
+        LazyLoader, lookups are efficient (__getattr__ is only called on
+        lookups that fail).
+        """
+        module = importlib.import_module(self.__name__)
+        self._parent_module_globals[self._local_name] = module
+        self.__dict__.update(module.__dict__)
+        return module
+
+    def __getattr__(self, item):
+        module = self._load()
+        return getattr(module, item)
+
+    def __dir__(self):
+        module = self._load()
+        return dir(module)
+
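+
+# An illustrative usage sketch (this mirrors how storages/model.py uses it):
+# the real import only happens on first attribute access.
+#
+#   torch = LazyLoader('torch', globals(), 'torch')
+#   torch.__version__   # triggers the actual ``import torch``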
diff --git a/tests/.gitignore b/tests/.gitignore
deleted file mode 100644
index 9e2c5e8..0000000
--- a/tests/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-
-.hangar
diff --git a/tests/conftest.py b/tests/conftest.py
index bfde459..5d40797 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,16 +1,51 @@
 from pathlib import Path
 import shutil
 import pytest
-import stockroom
+import hangar
+import numpy as np
+import lmdb
+import stockroom.repository
+from stockroom import init_repo, StockRoom

-# TODO: restructure the core and monkey patch the repo creation at CWD
+
+@pytest.fixture()
+def managed_tmpdir(monkeypatch, tmp_path):
+    monkeypatch.setitem(hangar.constants.LMDB_SETTINGS, 'map_size', 2_000_000)
+    monkeypatch.setattr(hangar.backends.hdf5_00, 'COLLECTION_COUNT', 10)
+    monkeypatch.setattr(hangar.backends.hdf5_00, 'COLLECTION_SIZE', 50)
+    monkeypatch.setattr(hangar.backends.hdf5_01, 'COLLECTION_COUNT', 10)
+    monkeypatch.setattr(hangar.backends.hdf5_01, 'COLLECTION_SIZE', 50)
+    monkeypatch.setattr(hangar.backends.numpy_10, 'COLLECTION_SIZE', 50)
+    stockroom.repository.RootTracker._instances = {}
+    yield tmp_path
+    shutil.rmtree(tmp_path)


 @pytest.fixture()
-def repo():
-    yield stockroom.init('s', 'a@b.c', overwrite=True)
-    cwd = Path.cwd()
-    shutil.rmtree(cwd.joinpath('.hangar'))
-    cwd.joinpath('head.stock').unlink()
+def repo(monkeypatch, managed_tmpdir):
+    cwd = Path(managed_tmpdir)
+    monkeypatch.setattr(Path, 'cwd', lambda: cwd)
+    cwd.joinpath(".git").mkdir()
+    cwd.joinpath(".gitignore").touch()
+    init_repo('s', 'a@b.c', overwrite=True)
+    yield None
+    stock = StockRoom()
+    try:
+        stock._repo.hangar_repository._env._close_environments()
+    except lmdb.Error:
+        pass  # environment already closed by downstream functions
+
+
+@pytest.fixture()
+def repo_with_aset(repo):
+    repo = hangar.Repository(Path.cwd())
+    co = repo.checkout(write=True)
+    arr = np.arange(20).reshape(4, 5)
+    co.arraysets.init_arrayset('aset', prototype=arr)
+    co.commit('init aset')
+    co.close()
+    yield None
+    repo._env._close_environments()
+    stock = StockRoom()
+    stock._repo.hangar_repository._env._close_environments()
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..c2aa3ba
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,31 @@
+import pytest
+from stockroom import StockRoom
+import numpy as np
+
+
+def test_save_data(repo_with_aset):
+    arr = np.arange(20).reshape(4, 5)
+    stock = StockRoom()
+    stock.data['aset', 1] = arr
+    stock.commit("added data")
+    assert np.allclose(stock.data['aset', 1], arr)
+
+
+def test_save_to_non_existing_column(repo):
+    arr = np.arange(20).reshape(4, 5)
+    stock = StockRoom()
+    with pytest.raises(KeyError):
+        stock.data['wrongaset', 1] = arr
+
+
+def test_save_to_different_typed_column(repo_with_aset):
+    arr = np.arange(20).reshape(4, 5).astype(np.float64)
+    stock = StockRoom()
+    with pytest.raises(ValueError):
+        stock.data['aset', 1] = arr
+
+
+def test_fetch_non_existing_sample_key(repo_with_aset):
+    stock = StockRoom()
+    with pytest.raises(KeyError):
+        stock.data['aset', 1]
diff --git a/tests/test_model.py b/tests/test_model.py
new file mode 100644
index 0000000..8658278
--- /dev/null
+++ b/tests/test_model.py
@@ -0,0 +1,93 @@
+import pytest
+import numpy as np
+from stockroom import StockRoom
+from copy import deepcopy
+
+
+class TestTFModelStore:
+
+    @staticmethod
+    def get_new_model():
+        import tensorflow as tf
+        tf_model = tf.keras.models.Sequential([
+            tf.keras.layers.Dense(3, activation='relu'),
+            tf.keras.layers.Dense(1, activation='relu')
+        ])
+        tf_model.build((5, 2))
+        return tf_model
+
+    @pytest.mark.filterwarnings('ignore:the imp module is deprecated:DeprecationWarning')
+    def test_saving_tf(self, repo):
+        stock = StockRoom()
+
+        tf_model = self.get_new_model()
+        old_weights = tf_model.get_weights()
+        stock.model.save_weights('tf_model', tf_model)
+        stock.commit("adding tf model")
+
+        tf_model = self.get_new_model()
+        tmp_weights = deepcopy(tf_model.get_weights())
+        stock.model.load_weights('tf_model', tf_model)
+        new_weights = deepcopy(tf_model.get_weights())
+        for k in range(len(old_weights)):
+            assert np.allclose(old_weights[k], new_weights[k])
+            # bias is initialized as zero in tensorflow, hence both the tmp
+            # and the new weights will be zero there
+            assert not tmp_weights[k].sum() == 0.0 \
+                or np.allclose(tmp_weights[k], new_weights[k])
+
+
+class TestTorchModelStore:
+
+    @staticmethod
+    def get_new_model():
+        import torch
+        torch_model = torch.nn.Sequential(
+            torch.nn.Linear(2, 3),
+            torch.nn.ReLU(),
+            torch.nn.Linear(3, 1))
+        return torch_model
+
+    def test_saving_torch(self, repo):
+        stock = StockRoom()
+
+        torch_model = self.get_new_model()
+        old_state = torch_model.state_dict()
+        stock.model.save_weights('torch_model', torch_model)
+        stock.commit('adding torch model')
+
+        torch_model = self.get_new_model()
+        tmp_state = deepcopy(torch_model.state_dict())
+        stock.model.load_weights('torch_model', torch_model)
+        new_state = deepcopy(torch_model.state_dict())
+        for k in old_state.keys():
+            assert np.allclose(old_state[k], new_state[k])
+            assert not np.allclose(tmp_state[k], new_state[k])
+
+    def test_load_with_different_library_version(self, repo, monkeypatch):
+        import torch
+        stock = StockRoom()
+        torch_model = self.get_new_model()
+        stock.model.save_weights('thm', torch_model)
+        stock.commit("adding th model")
+        with monkeypatch.context() as m:
+            m.setattr(torch, '__version__', '0.9')
+            with pytest.warns(UserWarning) as warning_rec:
+                stock.model.load_weights('thm', torch_model)
+            assert len(warning_rec) == 1
+            assert "PyTorch version used" in warning_rec[0].message.args[0]
+
+    def test_unknown_model_type(self, repo):
+        stock = StockRoom()
+        with pytest.raises(TypeError):
+            stock.model.save_weights('invalid', {})
+
+    def test_load_nonexisting_key(self, repo):
+        stock = StockRoom()
+        thm = self.get_new_model()
+        stock.model.save_weights('thm', thm)
+        stock.commit("adding th model")
+        with pytest.raises(KeyError) as error:
+            stock.model.load_weights('wrongname', thm)
+        assert 'Model with key wrongname not found' == error.value.args[0]
+        stock._repo._hangar_repo._env._close_environments()
diff --git a/tests/test_modelstore.py b/tests/test_modelstore.py
deleted file mode 100644
index d724e74..0000000
--- a/tests/test_modelstore.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import torch
-import numpy as np
-from stockroom import ModelStore
-import stockroom
-
-
-def get_torch_model():
-    torch_model = torch.nn.Sequential(
-        torch.nn.Linear(2, 3),
-        torch.nn.ReLU(),
-        torch.nn.Linear(3, 1))
-    return torch_model
-
-
-def get_tf_model():
-    pass
-
-
-def test_saving_torch(repo):
-    modelstore = ModelStore()
-
-    torch_model = get_torch_model()
-    old_state = torch_model.state_dict()
-    modelstore.save('torch_model', torch_model)
-    stockroom.commit('adding torch model')
-
-    torch_model = get_torch_model()
-    tmp_state = torch_model.state_dict()
-    modelstore.load('torch_model', torch_model)
-    new_state = torch_model.state_dict()
-    for k in old_state.keys():
-        assert np.allclose(old_state[k], new_state[k])
diff --git a/tests/test_multiple_instances.py b/tests/test_multiple_instances.py
new file mode 100644
index 0000000..2d62ef9
--- /dev/null
+++ b/tests/test_multiple_instances.py
@@ -0,0 +1,78 @@
+from stockroom import StockRoom
+from random import randint
+import numpy as np
+import pytest
+
+
+class TestSameProcess:
+
+    def test_opening_two_instances(self, repo_with_aset):
+        # todo: should we allow this?
+        stk1 = StockRoom()
+        stk2 = StockRoom()
+        arr = np.arange(20).reshape(4, 5)
+        oldarr = arr * randint(1, 100)
+        newarr = arr * randint(1, 100)
+
+        stk1.data['aset', 1] = oldarr
+        stk2.data['aset', 1] = newarr
+        stk1.commit('added data')
+
+        assert np.allclose(stk2.data['aset', 1], newarr)
+        assert not np.allclose(stk2.data['aset', 1], oldarr)
+
+    def test_operating_one_in_another_write_contextmanager(self, repo_with_aset):
+        stk1 = StockRoom()
+        stk2 = StockRoom()
+        arr = np.arange(20).reshape(4, 5)
+        oldarr = arr * randint(1, 100)
+
+        with stk1.optimize(write=True):
+            assert stk1._repo._optimized_Rcheckout is not None
+            assert stk1._repo._optimized_Wcheckout is not None
+            assert stk2._repo._optimized_Rcheckout is not None
+            assert stk2._repo._optimized_Wcheckout is not None
+            stk2.data['aset', 1] = oldarr
+            stk1.commit('adding data inside cm')
+            with pytest.raises(KeyError):
+                # TODO: document this scenario
+                data = stk1.data['aset', 1]
+
+            stk3 = StockRoom()
+            assert stk3._repo._optimized_Rcheckout is not None
+            assert stk3._repo._optimized_Wcheckout is not None
+
+        assert np.allclose(oldarr, stk1.data['aset', 1])
+
+    def test_opening_one_contextmanager_in_another(self, repo_with_aset):
+        stk1 = StockRoom()
+        stk2 = StockRoom()
+
+        with stk1.optimize(write=True):
+            with pytest.raises(RuntimeError):
+                with stk2.optimize():
+                    pass
+            assert stk2._repo._optimized_Rcheckout is not None
+            assert stk2._repo._optimized_Wcheckout is not None
+        assert stk1._repo._optimized_Rcheckout is None
+        assert stk1._repo._optimized_Wcheckout is None
+        assert stk2._repo._optimized_Rcheckout is None
+        assert stk2._repo._optimized_Wcheckout is None
+
+    def test_one_inside_another_read_contextmanager(self, repo_with_aset):
+        stk1 = StockRoom()
+        stk2 = StockRoom()
+        arr = np.arange(20).reshape(4, 5)
+
+        with stk1.optimize():
+            # non-optimized write inside read CM
+            assert stk2._repo._optimized_Wcheckout is None
+            stk2.data['aset', 1] = arr
+            stk2.commit('adding data')
+
+            with pytest.raises(RuntimeError):
+                with stk2.optimize(write=True):
+                    pass
+
+        with stk1.optimize():
+            assert np.allclose(stk2.data['aset', 1], arr)
\ No newline at end of file
diff --git a/tests/test_repository.py b/tests/test_repository.py
new file mode 100644
index 0000000..6bb8931
--- /dev/null
+++ b/tests/test_repository.py
@@ -0,0 +1,68 @@
+from pathlib import Path
+from stockroom import StockRoom, init_repo
+import pytest
+import hangar
+
+
+class TestInit:
+    @staticmethod
+    @pytest.fixture(autouse=False)
+    def repo_path(monkeypatch, managed_tmpdir):
+        path = Path(managed_tmpdir)
+        monkeypatch.setattr(Path, 'cwd', lambda: path)
+        return path
+
+    @staticmethod
+    @pytest.fixture(autouse=False)
+    def cwd(repo_path):
+        repo_path.joinpath('.git').mkdir()
+        return repo_path
+
+    def test_init(self, repo_path):
+        cwd = repo_path
+        cwd.joinpath(".git").mkdir()
+        init_repo('s', 'a@b.c', overwrite=True)
+        with open(cwd.joinpath('.gitignore')) as f:
+            assert '\n.hangar\n' in f.read()
+        assert cwd.joinpath('.hangar').exists()
+        assert cwd.joinpath('head.stock').exists()
+
+    def test_init_on_non_git_folder(self, repo_path):
+        with pytest.raises(RuntimeError):
+            init_repo('s', 'a@b.c', overwrite=True)
+
+    def test_stock_init_on_existing_hangar_repo(self, cwd):
+        repo = hangar.Repository(cwd, exists=False)
+        repo.init('a', 'a@b.c')
+        # TODO: it would be better to have `_close_environments` as a public
+        # attribute in hangar
+        repo._env._close_environments()
+        assert not cwd.joinpath('head.stock').exists()
+        init_repo()
+        assert cwd.joinpath('head.stock').exists()
+        with open(cwd.joinpath('.gitignore')) as f:
+            assert '\n.hangar\n' in f.read()
+
+
+class TestCommit:
+    def test_basic(self, repo):
+        stock = StockRoom()
+        stock.tag['key1'] = 'value'
+        stock.commit('generic data')
+        assert stock.tag['key1'] == 'value'
+        stock.tag['key2'] = 'value2'
+        with pytest.raises(KeyError):
+            stock.tag['key2']
+
+    def test_commit_hash(self, repo):
+        stock = StockRoom()
+        stock.tag['key1'] = 'value'
+        stock.commit('generic data')
+        with open(stock._repo.stockroot/'head.stock') as f:
+            digest1 = f.read()
+        stock.tag['key2'] = 'value2'
+        stock.commit('generic data 2')
+        with open(stock._repo.stockroot/'head.stock') as f:
+            digest2 = f.read()
+        log = stock._repo._hangar_repo.log(return_contents=True)
+        assert log['order'] == [digest2, digest1]
+
diff --git a/tests/test_tag.py b/tests/test_tag.py
new file mode 100644
index 0000000..d36d93f
--- /dev/null
+++ b/tests/test_tag.py
@@ -0,0 +1,20 @@
+from stockroom import StockRoom
+import pytest
+
+
+def test_basic(repo):
+    stock = StockRoom()
+    stock.tag['lr'] = 0.01
+    stock.tag['epochs'] = 500
+    stock.tag['optimizer'] = 'adam'
+    stock.commit('Saved lr')
+    assert stock.tag['lr'] == 0.01
+    assert stock.tag['optimizer'] == 'adam'
+    assert stock.tag['epochs'] == 500
+
+
+def test_save_unsupported_type(repo):
+    stock = StockRoom()
+    with pytest.raises(TypeError):
+        stock.tag['wrongdata'] = bytes('hi', 'utf-8')
+
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..957c920
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,82 @@
+[tox]
+envlist = py36, py37, mypy, docs, report, clean
+
+# ------------- Dependency setup ---------------------
+[deps]
+deps =
+    pytest
+    pytest-xdist
+    pytest-cov
+    pytest-travis-fold
+    tensorflow
+
+[mldeps-py36]
+deps =
+    {[deps]deps}
+    https://download.pytorch.org/whl/cpu/torch-1.3.1%2Bcpu-cp36-cp36m-win_amd64.whl ; sys_platform == 'win32'
+    torch == 1.3.1 ; sys_platform == 'darwin'
+    https://download.pytorch.org/whl/cpu/torch-1.3.1%2Bcpu-cp36-cp36m-linux_x86_64.whl ; sys_platform == 'linux'
+
+[mldeps-py37]
+deps =
+    {[deps]deps}
+    https://download.pytorch.org/whl/cpu/torch-1.3.1%2Bcpu-cp37-cp37m-win_amd64.whl ; sys_platform == 'win32'
+    torch == 1.3.1 ; sys_platform == 'darwin'
+    https://download.pytorch.org/whl/cpu/torch-1.3.1%2Bcpu-cp37-cp37m-linux_x86_64.whl ; sys_platform == 'linux'
+
+# ------------- Environments ---------------------
+
+[testenv]
+setenv =
+    PYTHONPATH = {toxinidir}
+passenv =
+    *
+
+[testenv:py36]
+basepython = {env:TOXPYTHON:python3.6}
+deps =
+    {[mldeps-py36]deps}
+commands =
+    pytest --basetemp={envtmpdir} --numprocesses=2 --cov --cov-report=term-missing
+
+[testenv:py37]
+basepython = {env:TOXPYTHON:python3.7}
+deps =
+    {[mldeps-py37]deps}
+commands =
+    pytest --basetemp={envtmpdir} --numprocesses=2 --cov --cov-report=term-missing -v
+
+[testenv:mypy]
+basepython = {env:TOXPYTHON:python3.7}
+skip_install = True
+commands =
+    mypy --config-file mypy.ini stockroom
+deps =
+    {[mldeps-py37]deps}
+    mypy >= 0.701
+    mypy-protobuf
+    grpcio_tools
+
+[testenv:docs]
+usedevelop = true
+deps =
+    -r{toxinidir}/docs/requirements.txt
+    {[mldeps-py37]deps}
+commands =
+    sphinx-build {posargs:-E} -b html docs dist/docs
+    sphinx-build -b linkcheck docs dist/docs -j4
+
+[testenv:report]
+deps =
+    coverage
+skip_install = true
+commands =
+    coverage report
+    coverage html
+
+[testenv:clean]
+skip_install = true
+deps =
+    coverage
+commands =
+    coverage erase
\ No newline at end of file