Skip to content

Commit

Permalink
Cleaning up getting started
Browse files Browse the repository at this point in the history
  • Loading branch information
itsderek23 committed May 8, 2020
1 parent 44c54e6 commit 340aa78
Show file tree
Hide file tree
Showing 10 changed files with 43 additions and 179 deletions.
3 changes: 2 additions & 1 deletion whisk/cli/commands/setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import click
from whisk.project import Project
import whisk.git as git
import os
import subprocess

Expand Down Expand Up @@ -42,7 +43,7 @@ def exec_setup(nbenv):
# direnv will fail if not installed
os.system("cp .envrc.example .envrc")
os.system("direnv allow . > /dev/null 2>&1")
if has_unstaged_changes():
if git.has_unstaged_changes():
exec("Adding files to git", "git add .")
exec("Making initial Git commit", "git commit -m 'Initial project structure' --author=\"Whisk <whisk@whisk-ml.org>\" > /dev/null")

Expand Down
24 changes: 1 addition & 23 deletions ...{ cookiecutter.repo_name }}/core/utils.py → whisk/dvc.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import os
from subprocess import check_output
from os.path import dirname, realpath
from pathlib import Path

def dvc_pull(dvc_file):
def pull(dvc_file):
"""
Pulls the output of the specified `dvc_file` into the repository.
This is useful when running outside the local environment (like a deployed web server)
Expand All @@ -17,23 +15,3 @@ def dvc_pull(dvc_file):
os.system("git init")
# Pull the training output (the serialized model) when running on a deployed server.
check_output(["dvc", "pull", dvc_file])

def project_dir():
    """
    Return the absolute path (as a string) of the root project directory,
    i.e. the parent of the directory containing this module.
    """
    # realpath resolves symlinks so the result is stable regardless of
    # how the package was imported.
    return dirname(dirname(realpath(__file__)))

def project_dir_name():
    """
    Return the name (final path component) of the project directory
    as a string.
    """
    return Path(project_dir()).name

def has_unstaged_changes():
    """
    Return True if the git repository in the current working directory
    reports any changes (staged, unstaged, or untracked files).

    NOTE(review): despite the name, `git status --porcelain` lists staged
    and untracked entries too, not only unstaged ones — confirm callers
    only need an "is the tree dirty?" check.
    """
    # Porcelain output emits one newline-terminated line per changed file,
    # so any newline in the output means at least one change exists.
    res=check_output("git status --porcelain",shell=True, universal_newlines=True)
    return ("\n" in res)
6 changes: 6 additions & 0 deletions whisk/git.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from subprocess import check_output

def has_unstaged_changes():
    """
    Return True when the git repo in the current directory has changes.

    Runs ``git status --porcelain``; that command prints one
    newline-terminated line per modified/untracked file, so a newline in
    the captured output indicates at least one pending change.
    """
    status = check_output("git status --porcelain", shell=True, universal_newlines=True)
    return "\n" in status
12 changes: 12 additions & 0 deletions whisk/model_stub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
class ModelStub:
    # DELETE ME - Remove this class when you load a real model. This only
    # exists so the ModelWrapper works when creating an initial project.
    """
    Stand-in for a real ML model. For each input row it "predicts" the
    number of features in that row.

    Example:

        ModelStub().predict([[1,2],[3,4]]) => [2,2]
    """
    def predict(self, X):
        # One prediction per row: simply the row's length.
        return [len(row) for row in X]
4 changes: 2 additions & 2 deletions whisk/template/{{ cookiecutter.repo_name }}/app/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import flask
from {{cookiecutter.project_name}}.core.utils import dvc_pull
import whisk.dvc as dvc
from {{cookiecutter.project_name}}.models.model import Model
import sys
import os
Expand All @@ -8,7 +8,7 @@
app = flask.Flask(__name__)
# Pull the output of the DVC stage used to generate the serialized model when running on a
# deployed server. For example:
# dvc_pull("train.dvc")
# dvc.pull("train.dvc")
model = Model()

@app.route("/predict", methods=["POST"])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
"source": [
"After training a model you should save it to disk so you can invoke the model later. The method call for saving a model to disk is dependent on your ML framework (for example, Scikit-learn uses pickle while you just call `save` on a PyTorch model).\n",
"\n",
"Regardless of your ML framework, save your model and required artifacts for pre/post-processing to the artifacts directory. You can obtain the path to this directory like this:"
"Regardless of your ML framework, save your model and required artifacts for pre/post-processing to the artifacts directory. Saving a model looks like this:"
]
},
{
Expand All @@ -113,8 +113,15 @@
"metadata": {},
"outputs": [],
"source": [
"import {{cookiecutter.project_name}}\n",
"{{cookiecutter.project_name}}.project.artifacts_dir"
"# This example uses pickle to serialize a Python object. \n",
"# Use the preferred serialization approach for your ML framework.\n",
"import pickle\n",
"from whisk.model_stub import ModelStub # A fake model\n",
"from {{cookiecutter.project_name}} import project\n",
"\n",
"model = ModelStub()\n",
"file_path = project.artifacts_dir / \"model.pkl\"\n",
"pickle.dump(model, open(file_path,\"wb\"))"
]
},
{
Expand Down Expand Up @@ -148,46 +155,6 @@
"source": [
"Update `src/{{cookiecutter.project_name}}/models/model.py` to handle loading and pre/post-processing for your own model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## HTTP Web Service\n",
"The `/app` directory contains a Flask app that's ready to serve a model. Start the web service from your terminal: `whisk app start`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Packaging your model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can share your model with the world as a plain-old Python Package. Just follow the standard Python packaging process. For example, to create a source distribution run the following in your terminal:\n",
"\n",
"```\n",
"python setup.py sdist\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This will create the package within the `dist/` directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
90 changes: 0 additions & 90 deletions whisk/template/{{ cookiecutter.repo_name }}/scripts/install.py

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from invoke import task
from {{cookiecutter.project_name}}.core.utils import has_unstaged_changes
import whisk.git as git

@task
def start(c):
Expand All @@ -13,7 +13,7 @@ def create(c,name):
"""
Create a Heroku app for the web service.
"""
if has_unstaged_changes():
if git.has_unstaged_changes():
print("This project has uncommitted changes.\nPlease add and commit the files to the Git repo, then retry:\n\ngit add .\ngit commit -m 'First Commit'")
exit(1)
c.run("heroku create -a {}".format(name))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,6 @@
class DummyModel:
    # DELETE ME - Remove this class when you load a real model. This only
    # exists so the ModelWrapper works when creating an initial project.
    """
    Stand-in for a real ML model. For each input row it "predicts" the
    number of features in that row.

    Example:

        DummyModel().predict([[1,2],[3,4]]) => [2,2]
    """
    def predict(self, X):
        # One prediction per row: simply the row's length.
        return [len(row) for row in X]
import pickle
from whisk.model_stub import ModelStub
import {{cookiecutter.project_name}}

class Model:
"""
Expand All @@ -22,19 +13,18 @@ def __init__(self):
Load the model + required pre-processing artifacts from disk. Loading from disk is slow,
so this is done in `__init__` rather than loading from disk on every call to `predict`.
Use paths relative to the project root directory.
Tensorflow example:
self.model = load_model("models/model.h5")
self.model = load_model({{cookiecutter.project_name}}.project.artifacts_dir / "model.h5")
Pickle example:
with open('models/tokenizer.pickle', 'rb') as handle:
self.tokenizer = pickle.load(handle)
with open({{cookiecutter.project_name}}.project.artifacts_dir / 'tokenizer.pickle', 'rb') as file:
self.tokenizer = pickle.load(file)
"""
# REPLACE ME - add your loading logic
self.model = DummyModel()
with open({{cookiecutter.project_name}}.project.artifacts_dir / "model.pkl", 'rb') as file:
self.model = pickle.load(file)

def predict(self,data):
"""
Expand Down

0 comments on commit 340aa78

Please sign in to comment.