Merge 27bc05a into df42d68

vanvalenlab · Aug 18, 2020 · 2468342 · 2468342
2 parents df42d68 + 27bc05a
commit 2468342
Show file tree

Hide file tree

Showing 9 changed files with 484 additions and 352 deletions.
diff --git a/caliban_toolbox/__init__.py b/caliban_toolbox/__init__.py
@@ -34,7 +34,6 @@
 from caliban_toolbox import utils
 from caliban_toolbox import aws_functions
 from caliban_toolbox import figure_eight_functions
-from caliban_toolbox import log_file
 from caliban_toolbox import relabel
 from caliban_toolbox import reshape_data
 

diff --git a/caliban_toolbox/anolytics.py b/caliban_toolbox/anolytics.py
@@ -0,0 +1,144 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import os
+
+import pandas as pd
+
+from caliban_toolbox import crowdsource
+from caliban_toolbox.utils.misc_utils import list_npzs_folder
+from caliban_toolbox.aws_functions import aws_upload_files, aws_download_files
+
+
+def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, log_name,
+                      pixel_only=False, label_only=False, rgb_mode=False):
+    """Generates csv logs of parameters used during job creation for subsequent use in pipeline.
+
+    Two files are written to the 'logs' folder inside base_dir (created if it
+    does not exist): log_name, with one row per file and every job parameter,
+    and 'small_' + log_name, with only the project_url column.
+
+    Args:
+        base_dir: full path to directory where job results will be stored
+        stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
+        aws_folder: folder in aws bucket where files will be stored
+        filenames: list of all files to be uploaded
+        filepaths: list of complete urls to images in Amazon S3 bucket
+        log_name: name for log file
+        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
+        label_only: flag specifying whether annotators will be restricted to label edit mode
+        rgb_mode: flag specifying whether annotators will view images in RGB mode
+    """
+
+    # the explicit index lets the scalar values repeat on every row
+    data = {'project_url': filepaths,
+            'filename': filenames,
+            'stage': stage,
+            'aws_folder': aws_folder,
+            'pixel_only': pixel_only,
+            'label_only': label_only,
+            'rgb_mode': rgb_mode}
+    dataframe = pd.DataFrame(data=data, index=range(len(filepaths)))
+
+    # create file location; exist_ok avoids a separate check-then-create step
+    log_dir = os.path.join(base_dir, 'logs')
+    os.makedirs(log_dir, exist_ok=True)
+
+    # save csv file
+    dataframe.to_csv(os.path.join(log_dir, log_name), index=False)
+
+    # create csv containing only URLs to upload to anolytics website;
+    # index=False keeps the row index from being written as an extra column
+    small_df = pd.DataFrame({'project_url': filepaths})
+    small_df.to_csv(os.path.join(log_dir, 'small_' + log_name), index=False)
+
+
+def create_anolytics_job(base_dir, aws_folder, stage, rgb_mode=False, label_only=False,
+                         pixel_only=False):
+    """Upload the NPZ crops of a job to aws and write the matching upload log.
+
+    Args:
+        base_dir: full path to job directory; must contain a 'crop_dir' folder with NPZs
+        aws_folder: folder in aws bucket where files will be stored
+        stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
+        rgb_mode: flag specifying whether annotators will view images in RGB mode
+        label_only: flag specifying whether annotators will be restricted to label edit mode
+        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
+
+    Raises:
+        ValueError: If invalid base_dir supplied
+        ValueError: If no crop directory found within base_dir
+        ValueError: If no NPZs found in crop directory
+    """
+
+    # validate the job layout before anything is uploaded
+    if not os.path.isdir(base_dir):
+        raise ValueError('Invalid directory name')
+
+    crop_dir = os.path.join(base_dir, 'crop_dir')
+    if not os.path.isdir(crop_dir):
+        raise ValueError('No crop directory found within base directory')
+
+    num_npzs = len(list_npzs_folder(crop_dir))
+    if num_npzs == 0:
+        raise ValueError('No NPZs found in crop dir')
+
+    # the same three flags go to both the URL builder and the log writer
+    edit_flags = {'pixel_only': pixel_only, 'label_only': label_only, 'rgb_mode': rgb_mode}
+
+    # get relevant paths
+    job_urls = crowdsource.create_job_urls(crop_dir=crop_dir, aws_folder=aws_folder,
+                                           stage=stage, **edit_flags)
+    local_paths, aws_keys, urls, npz_names = job_urls
+
+    # upload files to AWS bucket
+    aws_upload_files(local_paths=local_paths, aws_paths=aws_keys)
+
+    # Generate log file for current job; a new job always starts at stage number 0
+    upload_log_name = 'stage_0_{}_upload_log.csv'.format(stage)
+    create_upload_log(base_dir=base_dir, stage=stage, aws_folder=aws_folder,
+                      filenames=npz_names, filepaths=urls, log_name=upload_log_name,
+                      **edit_flags)
+
+
+def download_anolytics_output(base_dir):
+    """Download the annotated files listed in the latest upload log of a job.
+
+    Args:
+        base_dir: directory containing relevant job files
+
+    Returns:
+        list: file names of NPZs not found in AWS bucket
+    """
+
+    # the latest upload log in <base_dir>/logs describes what was uploaded
+    logs_path = os.path.join(base_dir, 'logs')
+    newest_log = crowdsource.get_latest_log_file(logs_path)
+    upload_log = pd.read_csv(os.path.join(logs_path, newest_log))
+
+    # annotations are downloaded into <base_dir>/output, created if needed
+    download_dir = os.path.join(base_dir, 'output')
+    os.makedirs(download_dir, exist_ok=True)
+
+    return aws_download_files(upload_log, download_dir)
diff --git a/caliban_toolbox/anolytics_test.py b/caliban_toolbox/anolytics_test.py
@@ -0,0 +1,75 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import os
+
+import numpy as np
+import pandas as pd
+
+from caliban_toolbox import anolytics
+from caliban_toolbox.aws_functions_test import FakeS3
+
+
+def test_create_anolytics_job(mocker, tmp_path):
+    """create_anolytics_job should write the stage-0 upload log for a valid job folder."""
+    # replace the credential prompt and the boto3 session with fakes
+    mocker.patch('getpass.getpass', lambda *x: 'test_api_key')
+    mocker.patch('boto3.Session', FakeS3)
+
+    # a crop directory holding one NPZ is enough to pass the input checks
+    npz_dir = os.path.join(tmp_path, 'crop_dir')
+    os.makedirs(npz_dir)
+    np.savez(os.path.join(npz_dir, 'test_crop.npz'))
+
+    anolytics.create_anolytics_job(base_dir=tmp_path, aws_folder='aws', stage='stage')
+
+    expected_log = os.path.join(tmp_path, 'logs',
+                                'stage_0_{}_upload_log.csv'.format('stage'))
+    assert os.path.exists(expected_log)
+
+
+def test_download_anolytics_output(mocker, tmp_path):
+    """download_anolytics_output should report no missing files, then all of them on error."""
+    mocker.patch('getpass.getpass', lambda *x: 'test_api_key')
+    mocker.patch('boto3.Session', FakeS3)
+
+    # write an upload log into the logs directory for the download to read
+    logs_dir = os.path.join(tmp_path, 'logs')
+    os.makedirs(logs_dir)
+    upload_records = {'filename': ['example_1.npz', 'example_2.npz'],
+                      'aws_folder': ['example_folder', 'example_folder'],
+                      'stage': ['stage_0', 'stage_0']}
+    pd.DataFrame(upload_records).to_csv(os.path.join(logs_dir, 'stage_0_upload_log.csv'))
+
+    assert anolytics.download_anolytics_output(tmp_path) == []
+
+    # with FakeS3 set to raise its 'missing' error, every file should be reported
+    mocker.patch('boto3.Session',
+                 lambda aws_access_key_id, aws_secret_access_key: FakeS3(raise_error='missing'))
+    missing_paths = anolytics.download_anolytics_output(tmp_path)
+    missing_names = [os.path.split(path)[1] for path in missing_paths]
+    assert missing_names == upload_records['filename']
diff --git a/caliban_toolbox/crowdsource.py b/caliban_toolbox/crowdsource.py
@@ -0,0 +1,105 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import re
+
+from urllib.parse import urlencode
+
+from caliban_toolbox.utils.misc_utils import list_npzs_folder
+
+
+def _format_url(aws_folder, stage, npz, url_encoded_dict):
+    """Fill the caliban URL template with the folder, stage, NPZ name and query string."""
+    url_template = 'https://caliban.deepcell.org/caliban-input__caliban-output__{}__{}__{}?{}'
+
+    return url_template.format(aws_folder, stage, npz, url_encoded_dict)
+
+
+def _create_next_log_name(previous_log_name, stage):
+    """Name the next upload log: previous stage number plus one, with the new stage label."""
+    # log names look like 'stage_<number>_..._upload_log.csv'; the number is the second field
+    name_fields = previous_log_name.split('_')
+    next_stage_num = int(name_fields[1]) + 1
+
+    return 'stage_{}_{}_upload_log.csv'.format(next_stage_num, stage)
+
+
+def get_latest_log_file(log_dir):
+    """Find the latest log file in the log directory
+
+    Every file whose name contains 'upload_log.csv' is a candidate. Candidates
+    are ranked by the integer in their 'stage_<number>_' prefix, so that
+    stage_10 counts as later than stage_9. Names without that prefix rank
+    below all numbered logs, and ties are broken alphabetically.
+
+    Args:
+        log_dir: full path to log directory
+
+    Returns:
+        string: name of the latest log file
+
+    Raises:
+        IndexError: If log_dir contains no upload logs
+    """
+    log_files = [name for name in os.listdir(log_dir) if 'upload_log.csv' in name]
+
+    if not log_files:
+        # indexing the empty list used to raise IndexError; keep the type, add a message
+        raise IndexError('No upload logs found in {}'.format(log_dir))
+
+    def _stage_order(name):
+        # a plain string sort would place 'stage_10_' before 'stage_2_'
+        stage_match = re.match(r'stage_(\d+)_', name)
+        stage_num = int(stage_match.group(1)) if stage_match else -1
+        return stage_num, name
+
+    return max(log_files, key=_stage_order)
+
+
+def create_job_urls(crop_dir, aws_folder, stage, pixel_only, label_only, rgb_mode):
+    """Helper function to create relevant URLs for caliban log and AWS upload
+
+    Args:
+        crop_dir: full path to directory with the npz crops
+        aws_folder: path for images to be stored in AWS
+        stage: which stage of the correction process this job is for
+        pixel_only: boolean flag to determine if only pixel mode is available
+        label_only: boolean flag to determine if only label is available
+        rgb_mode: boolean flag to determine if rgb mode will be enabled
+
+    Returns:
+        list: list of paths to local NPZs to be uploaded
+        list: list of paths to destination for NPZs
+        list: list of URLs to supply to figure8 to display crops
+        list: list of NPZs that will be uploaded
+    """
+    # AWS keys are '/'-separated on every platform, so they are joined with
+    # posixpath; os.path.join would insert backslashes on Windows
+    import posixpath
+
+    # TODO: check that URLS don't contain invalid character (nothing is validated yet)
+    # list the NPZs found in the specified folder
+    npzs_to_upload = list_npzs_folder(crop_dir)
+
+    # change slashes separating nested folders to underscores for URL generation
+    subfolders = '__'.join(aws_folder.split('/'))
+
+    # create dictionary to hold boolean flags
+    url_dict = {'pixel_only': pixel_only, 'label_only': label_only, 'rgb': rgb_mode}
+    url_encoded_dict = urlencode(url_dict)
+
+    # create path to npz, key to upload npz, and url path for figure8
+    npz_paths, npz_keys, url_paths = [], [], []
+    for npz in npzs_to_upload:
+        npz_paths.append(os.path.join(crop_dir, npz))
+        npz_keys.append(posixpath.join(aws_folder, stage, npz))
+        url_paths.append(_format_url(subfolders, stage, npz, url_encoded_dict))
+
+    # TODO: think about better way to structure than many lists
+    return npz_paths, npz_keys, url_paths, npzs_to_upload