diff --git a/.gitignore b/.gitignore index 27ad5c6..f2c0f99 100644 --- a/.gitignore +++ b/.gitignore @@ -110,4 +110,4 @@ ENV/ */.ipynb_checkpoints/ examples/head.stock examples/.gitignore -docs/*.ipynb \ No newline at end of file +docs/*.ipynb diff --git a/examples/requirements.txt b/examples/requirements.txt index 7ead59f..95e3318 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -2,5 +2,5 @@ jupyter notebook jupyterlab stockroom -google_images_download +python-mnist PILLOW diff --git a/examples/with-git.ipynb b/examples/with-git.ipynb index fa8dee6..83bb005 100644 --- a/examples/with-git.ipynb +++ b/examples/with-git.ipynb @@ -14,11 +14,8 @@ "For this tutorial, we use a pretrained PyTorch network to classify cats and dogs. We have divided the whole tutorial into 7 stages.\n", "1. Setup the repository\n", "2. Download some data and store it in stockroom\n", - "3. Download the pretrained model and save it in stockroom\n", - "4. Train the network and save the model + hyper parameters\n", - "5. Download more data if the accuracy is less\n", - "6. Train the network again with new data. Save the new network & hyper parameters\n", - "7. Fine tune the hyper parameters" + "3. Train the network and save the model + hyper parameters\n", + "4. Fine tune the hyper parameters" ] }, { @@ -33,14 +30,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Reinitialized existing Git repository in /home/hhsecond/mypro/stockroom/examples/.git/\n" + "Initialized empty Git repository in /home/hhsecond/mypro/stockroom/examples/.git/\n" ] } ], @@ -58,14 +55,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Hangar Repo initialized at: /home/hhsecond/mypro/stockroom/examples/.hangar\n" + "Hangar Repo initialized at: /home/hhsecond/mypro/stockroom/examples/.hangar\n", + "Stock file created\n" ] } ], @@ -83,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -91,11 +89,15 @@ "output_type": "stream", "text": [ "On branch master\n", + "\n", + "No commits yet\n", + "\n", "Untracked files:\n", " (use \"git add ...\" to include in what will be committed)\n", "\n", + "\t\u001b[31m.gitignore\u001b[m\n", "\t\u001b[31m.ipynb_checkpoints/\u001b[m\n", - "\t\u001b[31mdownloads/\u001b[m\n", + "\t\u001b[31mhead.stock\u001b[m\n", "\t\u001b[31mrequirements.txt\u001b[m\n", "\t\u001b[31mwith-git.ipynb\u001b[m\n", "\n", @@ -109,15 +111,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[master 66b4aee] initialized repo\n", - " 1 file changed, 1 insertion(+), 2 deletions(-)\n" + "[master (root-commit) bfe72a8] initialized repo\n", + " 2 files changed, 2 insertions(+)\n", + " create mode 100644 .gitignore\n", + " create mode 100644 head.stock\n" ] } ], @@ -132,168 +136,452 @@ "metadata": {}, "source": [ "## 2. Download & Store Data\n", - "As you know, we'll be building a dog vs cat classifier. Hence we need a lot of images of dogs and cats. Let's use a readily available package (`google_images_download`) to download images from google and store it in hangar\n", + "For this tutorial, as most of the tutorials, we'll build a fully connected network to predict hand written digits from MNIST dataset.\n", "\n", "#### Download images\n", - "This might throw error on some attempts but may be keep trying few times" + "We download the data using below utility functions (inspired from https://gist.github.com/goldsborough/6dd52a5e01ed73a642c1e772084bcd03)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlretrieve\n", + "import gzip\n", + "import os\n", + "import sys\n", + "\n", + "\n", + "def report_download_progress(chunk_number, chunk_size, file_size):\n", + " if file_size != -1:\n", + " percent = min(1, (chunk_number * chunk_size) / file_size)\n", + " bar = '#' * int(64 * percent)\n", + " sys.stdout.write('\\r0% |{:<64}| {}%'.format(bar, int(percent * 100)))\n", + "\n", + "\n", + "def download(destination_path, url):\n", + " if os.path.exists(destination_path):\n", + " print('{} already exists, skipping ...'.format(destination_path))\n", + " else:\n", + " print('Downloading {} ...'.format(url))\n", + " urlretrieve(url, destination_path, reporthook=report_download_progress)\n", + "\n", + "def unzip(zipped_path):\n", + " unzipped_path = os.path.splitext(zipped_path)[0]\n", + " if os.path.exists(unzipped_path):\n", + " print('{} already exists, skipping ... '.format(unzipped_path))\n", + " return\n", + " with gzip.open(zipped_path, 'rb') as zipped_file:\n", + " with open(unzipped_path, 'wb') as unzipped_file:\n", + " unzipped_file.write(zipped_file.read())\n", + " print('\\nUnzipped {} ...'.format(zipped_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Item no.: 1 --> Item name = cat\n", - "Evaluating...\n", - "Starting Download...\n", - "\n", - "\n", - "Unfortunately all 100 could not be downloaded because some images were not downloadable. 0 is all we got for this search filter!\n", - "\n", - "Errors: 0\n", - "\n", - "\n", - "Item no.: 1 --> Item name = dog\n", - "Evaluating...\n", - "Starting Download...\n", - "\n", - "\n", - "Unfortunately all 100 could not be downloaded because some images were not downloadable. 0 is all we got for this search filter!\n", - "\n", - "Errors: 0\n", - "\n" + "downloads/train-images-idx3-ubyte.gz already exists, skipping ...\n", + "downloads/train-images-idx3-ubyte already exists, skipping ... \n", + "downloads/train-labels-idx1-ubyte.gz already exists, skipping ...\n", + "downloads/train-labels-idx1-ubyte already exists, skipping ... \n", + "downloads/t10k-images-idx3-ubyte.gz already exists, skipping ...\n", + "downloads/t10k-images-idx3-ubyte already exists, skipping ... \n", + "downloads/t10k-labels-idx1-ubyte.gz already exists, skipping ...\n", + "downloads/t10k-labels-idx1-ubyte already exists, skipping ... \n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "RESOURCES = [\n", + " 'train-images-idx3-ubyte.gz',\n", + " 'train-labels-idx1-ubyte.gz',\n", + " 't10k-images-idx3-ubyte.gz',\n", + " 't10k-labels-idx1-ubyte.gz',\n", + "]\n", + "\n", + "path = Path('downloads')\n", + "path.mkdir(exist_ok=True)\n", + "\n", + "for resource in RESOURCES:\n", + " destination = os.path.join(str(path), resource)\n", + " url = 'http://yann.lecun.com/exdb/mnist/{}'.format(resource)\n", + " download(destination, url)\n", + " unzip(destination)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Store to StockRoom\n", + "We need hangar columns ready for stockroom to store data there. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized Arrayset: image\n", + "Initialized Arrayset: label\n", + "Commit message:\n", + "arrayset initialized\n", + "Commit Successful. Digest: a=28a09ff56d69697bc313561b362200ae94b389d5\n" + ] + } + ], + "source": [ + "!hangar arrayset create image INT64 784\n", + "!hangar arrayset create label INT64 1\n", + "!stock commit -m 'arrayset initialized'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from mnist import MNIST\n", + "mndata = MNIST(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "images, labels = mndata.load_training()\n", + "tmpimages, tmplabels = mndata.load_testing()\n", + "images.extend(tmpimages)\n", + "labels.extend(tmplabels)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from stockroom import StockRoom\n", + "stock = StockRoom()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Checking out COMMIT: a=28a09ff56d69697bc313561b362200ae94b389d5\n" ] }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 70000/70000 [00:28<00:00, 2433.96it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "import numpy as np\n", + "\n", + "with stock.optimize(write=True):\n", + " for i in tqdm(range(len(images))):\n", + " img = np.array(images[i])\n", + " label = np.array(labels[i]).reshape(1)\n", + " stock.data['image', i] = img\n", + " stock.data['label', i] = label" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Commit message:\n", + "added data\n", + "Commit Successful. Digest: a=d6b2e5d8bbc397eda5448b3eadc0dc39e14c123e\n" + ] + } + ], + "source": [ + "!stock commit -m 'added data'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Network training\n", + "Let's build a simple fully connected network in PyTorch" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "import torch\n", + "from stockroom import StockRoom\n", + "\n", + "def train(model, optimizer, criterion):\n", + " stock = StockRoom()\n", + "\n", + " with stock.optimize():\n", + " for epoch in range(stock.tag['epoch']):\n", + " running_loss = 0\n", + " trange = tqdm(range(70000))\n", + " for i in trange:\n", + " optimizer.zero_grad()\n", + " sample = torch.from_numpy(stock.data['image', i]).float()\n", + " sample /= 255\n", + " out = model(sample).unsqueeze(0)\n", + " label = torch.from_numpy(stock.data['label', i])\n", + " loss = criterion(out, label)\n", + " running_loss += loss.item()\n", + " loss.backward()\n", + " optimizer.step()\n", + " if i % 1000 == 0 and i != 0:\n", + " trange.set_description(str(running_loss / i))\n", + " stock.model['mnist'] = model.state_dict()\n", + " stock.commit('added model')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ { "data": { "text/plain": [ - "({'dog': []}, 0)" + "Sequential(\n", + " (0): Linear(in_features=784, out_features=32, bias=True)\n", + " (1): ReLU()\n", + " (2): Linear(in_features=32, out_features=16, bias=True)\n", + " (3): ReLU()\n", + " (4): Linear(in_features=16, out_features=10, bias=True)\n", + " (5): LogSoftmax()\n", + ")" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from google_images_download import google_images_download\n", - "downloader = google_images_download.googleimagesdownload()\n", - "arguments = {\n", - " \"format\": \"jpg\",\n", - " \"limit\":100,\n", - " \"print_urls\":True,\n", - " \"size\": \"medium\",\n", - " \"aspect_ratio\": \"panoramic\"\n", - "}\n", + "import torch.nn as nn\n", "\n", - "# downloading cat\n", - "arguments['keywords'] = 'cat'\n", - "downloader.download(arguments)\n", + "stock.tag['lr'] = 0.01\n", + "stock.tag['momentum'] = 0.5\n", + "stock.tag['epoch'] = 2\n", + "stock.commit('hyper params')\n", "\n", + "input_size = 784\n", + "hidden_sizes = [32, 16]\n", + "output_size = 10\n", "\n", - "# downloading dog\n", - "arguments['keywords'] = 'dog'\n", - "downloader.download(arguments)" + "model = nn.Sequential(\n", + " nn.Linear(input_size, hidden_sizes[0]),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_sizes[0], hidden_sizes[1]),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_sizes[1], output_size),\n", + " nn.LogSoftmax())" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 16, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Checking out COMMIT: a=5c291a0b2d946e3bfa359f754837a112df575bd6\n", + " * Checking out COMMIT: a=5c291a0b2d946e3bfa359f754837a112df575bd6\n" + ] + } + ], "source": [ - "#### Store to StockRoom\n", - "We need hangar columns ready for stockroom to store data there. " + "from torch import optim\n", + "\n", + "optimizer = optim.SGD(model.parameters(), lr=stock.tag['lr'], momentum=stock.tag['momentum'])\n", + "criterion = nn.NLLLoss()" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Checking out COMMIT: a=5c291a0b2d946e3bfa359f754837a112df575bd6\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/70000 [00:00