Merge branch 'master' into add_mongo
MekWarrior committed Sep 28, 2020
2 parents 2c97d4f + aff807a commit dc6e2d0
Showing 103 changed files with 3,621 additions and 350 deletions.
1 change: 0 additions & 1 deletion .dockerignore
@@ -17,7 +17,6 @@ Dockerfile

## data files##
data/
example_data/

### Python ###
# Byte-compiled / optimized / DLL files
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,4 +1,4 @@
/data

.DS_Store
._*
# Byte-compiled / optimized / DLL files
36 changes: 19 additions & 17 deletions README.md
@@ -3,16 +3,29 @@
[![Build Status](https://travis-ci.com/vanvalenlab/caliban-toolbox.svg?branch=master)](https://travis-ci.com/vanvalenlab/caliban-toolbox)
[![Coverage Status](https://coveralls.io/repos/github/vanvalenlab/caliban-toolbox/badge.svg?branch=master)](https://coveralls.io/github/vanvalenlab/caliban-toolbox?branch=master)

DeepCell Toolbox is a collection of data engineering tools for processing, annotating, and packaging optical microscopy images. The framework enables crowdsourced annotations and creates training data for [DeepCell](https://github.com/vanvalenlab/deepcell-tf).
Caliban Toolbox is a collection of data engineering tools to process and curate crowdsourced image annotations using [Caliban](https://github.com/vanvalenlab/caliban), our data annotation tool. The Toolbox and Caliban work together to generate annotations for training [DeepCell](https://github.com/vanvalenlab/deepcell-tf).

The process is as follows:
![flow](./docs/flowchart.png)

Read the documentation at
1. Raw data is imported using the data loader, which allows the user to select data based on imaging platform, cell type, and marker of interest.

## Getting Started
2. The raw data can then be run through deepcell-tf to produce predicted labels.

3. After making predictions with deepcell, the raw data is processed to make it easier for annotators to view. This includes applying filters, adjusting the contrast, etc. Multiple channels, including the user-generated modified channels, can be combined to create summed channels. Following these modifications, the user selects which of these channels the annotators will see (steps 3–5 are loosely illustrated in the sketch following this list).

4. The size of the images is then modified to make annotation easier. To get high-quality annotations, it is important that the images are not so large that the annotators miss errors. Therefore, the images can be cropped into overlapping 2D regions to break up large FOVs. Stacks of images can be further sliced into smaller, more manageable pieces.

5. Once the image dimensions have been set, each unique crop or slice is saved as an NPZ file. During this process, a JSON file is created which stores the necessary data to reconstruct the original image after annotation.

DeepCell Data Engineering uses `nvidia-docker` and `tensorflow` to enable GPU processing.
6. The NPZ files are then uploaded to a cloud bucket, where they can be accessed by the crowdsource platform. During the upload process, the user specifies an existing job to use as a template, which populates the instructions for the annotators and the job settings. A log file is also created with the necessary information to download the annotations once the job is completed.

7. Once the job is completed, the corrected annotations are downloaded from the AWS bucket, where they are stored as the job progresses.

8. These annotations are then stitched back together, and saved as full-size NPZ files to be manually inspected for errors.

9. Following correction, the individual Caliban NPZ files are combined into a single training data NPZ and saved in the appropriate location in the training data ontology.
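
The snippet below is a rough, self-contained illustration of steps 3–5: it stretches the contrast of a raw channel, builds a summed channel, crops the image into overlapping tiles, and writes NPZ files plus a JSON log for later reconstruction. The helper names, array shapes, and NPZ keys are made up for this example and are not the toolbox's actual API.

```python
import json

import numpy as np


def stretch_contrast(channel, low=5, high=95):
    """Rescale a channel to [0, 1] between two intensity percentiles (illustrative only)."""
    lo, hi = np.percentile(channel, [low, high])
    return np.clip((channel - lo) / (hi - lo + 1e-8), 0, 1)


def crop_with_overlap(image, crop_size=256, overlap=32):
    """Split a 2D image into overlapping crops, recording the offset of each crop."""
    stride = crop_size - overlap
    crops, offsets = [], []
    for row in range(0, image.shape[0], stride):
        for col in range(0, image.shape[1], stride):
            crops.append(image[row:row + crop_size, col:col + crop_size])
            offsets.append({'row': row, 'col': col})
    return crops, offsets


# toy two-channel FOV; real data would come from the data loader
nuclear = np.random.rand(1024, 1024)
membrane = np.random.rand(1024, 1024)

# step 3: adjust contrast and build a summed channel for the annotators
summed = stretch_contrast(nuclear) + stretch_contrast(membrane)

# step 4: break the FOV into overlapping crops (edge crops may be smaller)
crops, offsets = crop_with_overlap(summed)

# step 5: save each crop as an NPZ and keep a JSON log for reconstruction
for i, crop in enumerate(crops):
    np.savez('crop_{}.npz'.format(i), X=crop)

with open('crop_log.json', 'w') as f:
    json.dump({'original_shape': list(summed.shape), 'offsets': offsets}, f)
```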

## Getting Started

### Build a local docker container

@@ -23,20 +36,9 @@ docker build -t $USER/caliban_toolbox .

```

The tensorflow version can be overridden with the build-arg `TF_VERSION`.

```bash
docker build --build-arg TF_VERSION=1.9.0-gpu -t $USER/caliban_toolbox .
```

### Run the new docker image

```bash
# NV_GPU refers to the specific GPU to run DeepCell Toolbox on, and is not required

# Mounting the codebase, scripts and data to the container is also optional
# but can be handy for local development

NV_GPU='0' nvidia-docker run -it \
-p 8888:8888 \
$USER/caliban_toolbox:latest
@@ -47,7 +49,7 @@ It can also be helpful to mount the local copy of the repository and the scripts
```bash
NV_GPU='0' nvidia-docker run -it \
-p 8888:8888 \
-v $PWD/caliban_toolbox:/usr/local/lib/python3.5/dist-packages/caliban_toolbox/ \
-v $PWD/caliban_toolbox:/usr/local/lib/python3.7/site-packages/caliban_toolbox/ \
-v $PWD/notebooks:/notebooks \
-v /data:/data \
$USER/caliban_toolbox:latest
101 changes: 45 additions & 56 deletions caliban_toolbox/aws_functions.py
@@ -26,14 +26,10 @@
import os
import sys
import threading
import re

import boto3

from urllib.parse import urlencode
from getpass import getpass

from caliban_toolbox.utils.misc_utils import list_npzs_folder
import botocore
import getpass


# Taken from AWS Documentation
@@ -56,8 +52,8 @@ def __call__(self, bytes_amount):


def connect_aws():
    AWS_ACCESS_KEY_ID = getpass('What is your AWS access key id? ')
    AWS_SECRET_ACCESS_KEY = getpass('What is your AWS secret access key id? ')
    AWS_ACCESS_KEY_ID = getpass.getpass('What is your AWS access key id? ')
    AWS_SECRET_ACCESS_KEY = getpass.getpass('What is your AWS secret access key id? ')

    session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
@@ -67,67 +63,46 @@ def connect_aws():
    return s3


def aws_upload_files(aws_folder, stage, upload_folder, pixel_only, label_only, rgb_mode):
def aws_upload_files(local_paths, aws_paths):
"""Uploads files to AWS bucket for use in Figure 8
Args:
aws_folder: folder where uploaded files will be stored
stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
upload_folder: path to folder containing files that will be uploaded
pixel_only: boolean flag to set pixel_only mode
label_only: boolean flag to set label_only mode
rgb_mode: boolean flag to set rgb_mode
local_paths: list of paths to npz files
aws_paths: list of paths for saving npz files in AWS
"""

s3 = connect_aws()

# load the images from specified folder but not the json log file
files_to_upload = list_npzs_folder(upload_folder)

filename_list = []

# change slashes separating nested folders to underscores for URL generation
subfolders = re.split('/', aws_folder)
subfolders = '__'.join(subfolders)

url_dict = {'pixel_only': pixel_only, 'label_only': label_only, 'rgb': rgb_mode}
url_encoded_dict = urlencode(url_dict)

# upload images
for img in files_to_upload:

# full path to image
img_path = os.path.join(upload_folder, img)

# destination path
img_key = os.path.join(aws_folder, stage, img)

# upload
s3.upload_file(img_path, 'caliban-input', img_key, Callback=ProgressPercentage(img_path),
ExtraArgs={'ACL': 'public-read', 'Metadata': {'source_path': img_path}})
for i in range(len(local_paths)):
s3.upload_file(Filename=local_paths[i], Bucket='caliban-input', Key=aws_paths[i],
Callback=ProgressPercentage(local_paths[i]),
ExtraArgs={'ACL': 'public-read',
'Metadata': {'source_path': local_paths[i]}})
print('\n')

url = 'https://caliban.deepcell.org/{}__{}__{}__' \
'{}__{}?{}'.format('caliban-input', 'caliban-output', subfolders, stage, img,
url_encoded_dict)

# add caliban url to list
filename_list.append(url)

return files_to_upload, filename_list
def aws_copy_files(current_folder, next_folder, filenames):
"""Copy files from one AWS bucket to another.
Args:
current_folder: aws folder with current files
next_folder: aws folder where files will be copied
filenames: list of NPZ files to copy
"""

def aws_transfer_file(s3, input_bucket, output_bucket, key_src, key_dst):
"""Helper function to transfer files from one bucket/key to another. Used
in conjunction with a soon-to-be-created transfer jobs script for jobs with multiple stages"""
s3 = connect_aws()

copy_source = {'Bucket': output_bucket,
'Key': key_src}
for file in filenames:
copy_source = {'Bucket': 'caliban-output',
'Key': os.path.join(current_folder, file)}

s3.copy(copy_source, input_bucket, key_dst,
ExtraArgs={'ACL': 'public-read'})
s3.copy(CopySource=copy_source, Bucket='caliban-input',
Key=os.path.join(next_folder, file),
ExtraArgs={'ACL': 'public-read'})


# TODO: catch missing files
def aws_download_files(upload_log, output_dir):
"""Download files following Figure 8 annotation.
@@ -143,13 +118,27 @@ def aws_download_files(upload_log, output_dir):
    aws_folder = upload_log['aws_folder'][0]
    stage = upload_log['stage'][0]

    # track missing files
    missing = []

    # download all images
    for img in files_to_download:
    for file in files_to_download:

        # full path to save image
        save_path = os.path.join(output_dir, img)
        local_path = os.path.join(output_dir, file)

        # path to file in aws
        img_path = os.path.join(aws_folder, stage, img)
        aws_path = os.path.join(aws_folder, stage, file)

        try:
            s3.download_file(Bucket='caliban-output', Key=aws_path, Filename=local_path)
        except botocore.exceptions.ClientError as e:
            error_code = e.response['Error']['Code']

            if error_code == '404':
                print('The file {} does not exist'.format(aws_path))
                missing.append(aws_path)
            else:
                raise e

        s3.download_file(Bucket='caliban-output', Key=img_path, Filename=save_path)
    return missing
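
For orientation, here is a minimal usage sketch of the three public functions above (not part of the diff). It assumes the package is installed, that valid AWS credentials are entered at the getpass prompts, and that the listed folders and NPZ files exist; the `caliban-input` and `caliban-output` buckets are hard-coded inside the functions, and the experiment and file names below are made up.

```python
import os

from caliban_toolbox import aws_functions

# hypothetical NPZ crops on disk and their destination keys in the caliban-input bucket
local_dir = 'crops'
filenames = ['fov_0_crop_0.npz', 'fov_0_crop_1.npz']
local_paths = [os.path.join(local_dir, f) for f in filenames]
aws_paths = [os.path.join('experiment_1', 'stage_0', f) for f in filenames]

# upload the crops for annotation (prompts for AWS keys via getpass)
aws_functions.aws_upload_files(local_paths=local_paths, aws_paths=aws_paths)

# copy annotated files from caliban-output into the next stage's folder in caliban-input
aws_functions.aws_copy_files(current_folder='experiment_1/stage_0',
                             next_folder='experiment_1/stage_1',
                             filenames=filenames)

# download completed annotations; upload_log mirrors the structure used in the tests below
upload_log = {'aws_folder': ['experiment_1'],
              'stage': ['stage_0'],
              'filename': filenames}
missing = aws_functions.aws_download_files(upload_log=upload_log, output_dir='downloads')
print('missing files:', missing)
```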
98 changes: 98 additions & 0 deletions caliban_toolbox/aws_functions_test.py
@@ -0,0 +1,98 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import boto3
import botocore
import pytest

from caliban_toolbox import aws_functions
import pathlib


class FakeS3(object):

    def __init__(self, aws_access_key_id='key', aws_secret_access_key='secret', raise_error=None):
        self.raise_error = raise_error

    def client(self, *_, **__):
        return self

    def upload_file(self, Filename, Bucket, Key, Callback, ExtraArgs):
        assert os.path.exists(Filename)

    def download_file(self, Bucket, Key, Filename):
        if self.raise_error is None:
            pathlib.Path(Filename).touch()
        elif self.raise_error == 'missing':
            raise botocore.exceptions.ClientError(error_response={'Error': {'Code': '404'}},
                                                  operation_name='missing_file')
        elif self.raise_error == 'other':
            raise botocore.exceptions.ClientError(error_response={'Error': {'Code': '555'}},
                                                  operation_name='some_other_error')


# TODO: Can we spy on this function in order to have some sort of correctness test here?
def test_aws_upload_files(mocker, tmp_path):
    mocker.patch('getpass.getpass', lambda *x: None)
    mocker.patch('boto3.Session', FakeS3)

    local_files = ['npz_file_' + str(num) for num in range(5)]
    aws_paths = ['aws_bucket/folder/npz_file_' + str(num) for num in range(5)]

    for file in local_files:
        pathlib.Path(os.path.join(tmp_path, file)).touch()

    local_paths = [os.path.join(tmp_path, file) for file in local_files]

    aws_functions.aws_upload_files(local_paths=local_paths, aws_paths=aws_paths)


def test_aws_download_files(mocker, tmp_path):
    mocker.patch('getpass.getpass', lambda *x: None)
    mocker.patch('boto3.Session', FakeS3)

    filenames = ['npz_file_' + str(num) for num in range(5)]

    upload_log = {'stage': ['stage_0'],
                  'aws_folder': ['temp_folder'],
                  'filename': filenames}

    # no missing files
    missing = aws_functions.aws_download_files(upload_log=upload_log, output_dir=tmp_path)
    assert missing == []

    # catch missing file error, return list of missing files
    mocker.patch('boto3.Session',
                 lambda aws_access_key_id, aws_secret_access_key: FakeS3(raise_error='missing'))
    missing = aws_functions.aws_download_files(upload_log=upload_log, output_dir=tmp_path)
    missing = [os.path.split(file_path)[1] for file_path in missing]
    assert missing == filenames

    # all other errors not caught
    with pytest.raises(botocore.exceptions.ClientError):
        mocker.patch('boto3.Session',
                     lambda aws_access_key_id, aws_secret_access_key: FakeS3(raise_error='other'))
        missing = aws_functions.aws_download_files(upload_log=upload_log, output_dir=tmp_path)
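
These tests rely on pytest's `tmp_path` fixture and the `mocker` fixture from pytest-mock. Assuming both plugins are installed, the file could be run on its own with something like the following (a sketch, not part of the diff):

```python
import pytest

# run only the AWS function tests in verbose mode; requires pytest and pytest-mock
pytest.main(['caliban_toolbox/aws_functions_test.py', '-v'])
```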
