Merge 06cc0e4 into 23d9e2a

ngreenwald committed Jun 7, 2020
2 parents 23d9e2a + 06cc0e4 commit e9d9e8e
Showing 8 changed files with 126 additions and 176 deletions.
30 changes: 20 additions & 10 deletions caliban_toolbox/aws_functions.py
@@ -70,9 +70,9 @@ def connect_aws():
def aws_upload_files(local_paths, aws_paths):
"""Uploads files to AWS bucket for use in Figure 8
Args:
local_paths: list of paths to npz files
aws_paths: list of paths for saving npz files in AWS
Args:
local_paths: list of paths to npz files
aws_paths: list of paths for saving npz files in AWS
"""

s3 = connect_aws()
@@ -86,17 +86,27 @@ def aws_upload_files(local_paths, aws_paths):
    print('\n')
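
For orientation, a minimal usage sketch of this uploader; the local paths and destination keys below are hypothetical and must match one-to-one:

from caliban_toolbox.aws_functions import aws_upload_files

# hypothetical example: one AWS destination key per local NPZ crop
local_paths = ['/data/example_job/crops/crop_0.npz', '/data/example_job/crops/crop_1.npz']
aws_paths = ['example_job/stage_0/crop_0.npz', 'example_job/stage_0/crop_1.npz']

aws_upload_files(local_paths=local_paths, aws_paths=aws_paths)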


def aws_transfer_file(s3, input_bucket, output_bucket, key_src, key_dst):
"""Helper function to transfer files from one bucket/key to another. Used
in conjunction with a soon-to-be-created transfer jobs script for jobs with multiple stages"""
def aws_copy_files(current_folder, next_folder, filenames):
"""Copy files from one AWS bucket to another.
Args:
current_folder: aws folder with current files
next_folder: aws folder where files will be copied
filenames: list of NPZ files to copy
"""

    s3 = connect_aws()

    for file in filenames:
        copy_source = {'Bucket': 'caliban-output',
                       'Key': os.path.join(current_folder, file)}

        s3.copy(CopySource=copy_source, Bucket='caliban-input',
                Key=os.path.join(next_folder, file),
                ExtraArgs={'ACL': 'public-read'})
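
Note that the rewritten helper hard-codes the source bucket ('caliban-output') and destination bucket ('caliban-input'), so callers only supply folders and filenames. A usage sketch with hypothetical folder names:

from caliban_toolbox.aws_functions import aws_copy_files

# copy annotated stage_0 output forward so it becomes stage_1 input
aws_copy_files(current_folder='example_job/stage_0',
               next_folder='example_job/stage_1',
               filenames=['crop_0.npz', 'crop_1.npz'])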


# TODO: catch missing files
def aws_download_files(upload_log, output_dir):
"""Download files following Figure 8 annotation.
85 changes: 75 additions & 10 deletions caliban_toolbox/figure_eight_functions.py
@@ -23,6 +23,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import json
import requests
import os
import stat
@@ -35,7 +36,7 @@
from urllib.parse import urlencode

from caliban_toolbox.log_file import create_upload_log
from caliban_toolbox.aws_functions import aws_upload_files, aws_download_files
from caliban_toolbox.aws_functions import aws_upload_files, aws_copy_files, aws_download_files
from caliban_toolbox.utils.misc_utils import list_npzs_folder


@@ -47,16 +48,18 @@ def _format_url(aws_folder, stage, npz, url_encoded_dict):


def _create_next_log_name(previous_log_name, stage):
    stage_num = int(previous_log_name.split('_')[1])
    new_log = 'stage_{}_{}_upload_log.csv'.format(stage_num + 1, stage)

    return new_log
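
The int() cast here fixes a real bug: previous_log_name.split('_')[1] returns a string, so the uncast version of stage_num + 1 raised a TypeError. With the cast, the helper behaves as intended:

# 'stage_0_upload_log.csv' splits into ['stage', '0', 'upload', 'log.csv'], so stage_num is 0
_create_next_log_name('stage_0_upload_log.csv', 'annotation')
# returns 'stage_1_annotation_upload_log.csv'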


def get_latest_log_file(log_dir):
"""Find the latest log file in the log directory
Args:
log_dir: full path to log directory
Returns:
string: name of the latest log file
"""
@@ -69,18 +72,21 @@ def get_latest_log_file(log_dir):
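
The function body is collapsed in this view; a minimal sketch of one plausible implementation, assuming log files follow the stage_<n>_..._upload_log.csv convention used by _create_next_log_name above:

import os

def _latest_log_sketch(log_dir):
    # keep only upload logs, then sort on the integer stage number embedded in each name
    logs = [f for f in os.listdir(log_dir) if f.endswith('upload_log.csv')]
    return sorted(logs, key=lambda name: int(name.split('_')[1]))[-1]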

def create_job_urls(crop_dir, aws_folder, stage, pixel_only, label_only, rgb_mode):
"""Helper function to create relevant URLs for caliban log and AWS upload
Args:
crop_dir: full path to directory with the npz crops
aws_folder: path for images to be stored in AWS
stage: which stage of the correction process this job is for
pixel_only: boolean flag to determine if only pixel mode is available
label_only: boolean flag to determine if only label is available
rgb_mode: boolean flag to determine if rgb mode will be enabled
Returns:
list: list of paths to local NPZs to be uploaded
list: list of paths to desintation for NPZs
list: list of URLs to supply to figure8 to to display crops
list: list of NPZs that will be uploaded
Raises:
ValueError: If URLs are not valid
"""
@@ -103,7 +109,7 @@ def create_job_urls(crop_dir, aws_folder, stage, pixel_only, label_only, rgb_mode):
        npz_keys.append(os.path.join(aws_folder, stage, npz))
        url_paths.append(_format_url(subfolders, stage, npz, url_encoded_dict))

    # TODO: think about better way to structure than many lists
    return npz_paths, npz_keys, url_paths, npzs_to_upload
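
For reference, the shape of the Caliban URLs these helpers assemble can be inferred from the deprecated snippet removed from log_file.py later in this commit: double-underscore-separated fields plus optional url-encoded flags. A hedged illustration with hypothetical values (the authoritative template lives in _format_url, which is not shown in this diff):

from urllib.parse import urlencode

aws_folder, stage, npz = 'example_job', 'stage_1', 'crop_0.npz'
url_encoded_dict = urlencode({'rgb': 'true'})

url = 'https://caliban.deepcell.org/{0}__{1}__{2}__{3}__{4}?{5}'.format(
    'caliban-input', 'caliban-output', aws_folder, stage, npz, url_encoded_dict)
# -> 'https://caliban.deepcell.org/caliban-input__caliban-output__example_job__stage_1__crop_0.npz?rgb=true'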


@@ -144,9 +150,6 @@ def upload_log_file(log_file, job_id, key):
    url_encoded_dict = urllib.parse.urlencode(url_dict)
    url = url.format(job_id, url_encoded_dict)

    headers = {"Content-Type": "text/csv"}
    add_data = requests.put(url, data=log_file, headers=headers)

@@ -156,13 +159,14 @@
print("Data successfully uploaded to Figure Eight.")


def create_figure_eight_job(base_dir, job_id_to_copy, job_name, aws_folder, stage,
                            rgb_mode=False, label_only=False, pixel_only=False):
    """Create a Figure 8 job and upload data to it. New job ID printed out for convenience.

    Args:
        base_dir: full path to job directory
        job_id_to_copy: ID number of Figure 8 job to use as template for new job
        job_name: name for new job
        aws_folder: folder in aws bucket where files will be stored
        stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
@@ -190,10 +194,11 @@ def create_figure_eight_job(base_dir, job_id_to_copy, aws_folder, stage,

    # copy job without data
    new_job_id = copy_job(job_id_to_copy, key)
    if new_job_id == -1:
        return
    print('New job ID is: ' + str(new_job_id))

    # set name of new job
    rename_job(new_job_id, key, job_name)

    # get relevant paths
    npz_paths, npz_keys, url_paths, npzs = create_job_urls(crop_dir=upload_folder,
                                                           aws_folder=aws_folder,
@@ -219,6 +224,67 @@ def create_figure_eight_job(base_dir, job_id_to_copy, aws_folder, stage,
    upload_log_file(log_file, new_job_id, key)


def transfer_figure_eight_job(base_dir, job_id_to_copy, new_stage, job_name,
                              rgb_mode=False, label_only=False, pixel_only=False):
    """Create a Figure 8 job based on the output of a previous Figure 8 job

    Args:
        base_dir: full path to job directory
        job_id_to_copy: ID number of Figure 8 job to use as template for new job
        new_stage: specifies new_stage for subsequent job
        job_name: name for next job
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
        label_only: flag specifying whether annotators will be restricted to label edit mode
        rgb_mode: flag specifying whether annotators will view images in RGB mode
    """

    key = str(getpass("Figure eight api key?"))

    # copy job without data
    new_job_id = copy_job(job_id_to_copy, key)
    print('New job ID is: ' + str(new_job_id))

    # set name of new job
    rename_job(new_job_id, key, job_name)

    # get info from previous stage
    log_dir = os.path.join(base_dir, 'logs')
    previous_log_file = get_latest_log_file(log_dir)
    previous_log = pd.read_csv(os.path.join(log_dir, previous_log_file))
    filenames = previous_log['filename']
    previous_stage = previous_log['stage'][0]
    aws_folder = previous_log['aws_folder'][0]

    current_bucket = os.path.join(aws_folder, previous_stage)
    next_bucket = os.path.join(aws_folder, new_stage)

    # transfer files to new stage
    aws_copy_files(current_folder=current_bucket, next_folder=next_bucket,
                   filenames=filenames)

    new_log_name = _create_next_log_name(previous_log_file, new_stage)

    # TODO: Decide if this should be handled by a separate function that is specific to transfer?
    _, _, filepaths, _ = create_job_urls(crop_dir=os.path.join(base_dir, 'crop_dir'),
                                         aws_folder=aws_folder, stage=new_stage,
                                         pixel_only=pixel_only, label_only=label_only,
                                         rgb_mode=rgb_mode)

    # generate log file for current job
    create_upload_log(base_dir=base_dir, stage=new_stage, aws_folder=aws_folder,
                      filenames=filenames, filepaths=filepaths, job_id=new_job_id,
                      pixel_only=pixel_only, rgb_mode=rgb_mode, label_only=label_only,
                      log_name=new_log_name)

    log_path = open(os.path.join(base_dir, 'logs', new_log_name), 'r')
    log_file = log_path.read()

    # upload log file
    upload_log_file(log_file, new_job_id, key)

    return log_file
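
Taken together, the two entry points support multi-stage annotation: create the first job, let annotators work, then transfer its output into a follow-up job. A hedged end-to-end sketch with hypothetical arguments:

# stage 1: create the initial job from a template and upload crops
create_figure_eight_job(base_dir='/data/example_job', job_id_to_copy=123456,
                        job_name='example_stage_1', aws_folder='example_job',
                        stage='stage_1', rgb_mode=True)

# ... annotators complete the job in Figure Eight ...

# stage 2: copy the corrected output forward and launch the next round
transfer_figure_eight_job(base_dir='/data/example_job', job_id_to_copy=123456,
                          new_stage='stage_2', job_name='example_stage_2',
                          rgb_mode=True)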


def download_report(job_id, log_dir):
"""Download job report from Figure 8
Expand Down Expand Up @@ -284,7 +350,6 @@ def download_figure_eight_output(base_dir):
    job_id = log_file['job_id'][0]

    # download Figure 8 report
    download_report(job_id=job_id, log_dir=log_dir)
    unzip_report(log_dir=log_dir)

88 changes: 3 additions & 85 deletions caliban_toolbox/log_file.py
@@ -32,10 +32,8 @@
import numpy as np
import requests



def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, job_id, log_name,
                      pixel_only=False, label_only=False, rgb_mode=False):
    """Generates a csv log of parameters used during job creation for subsequent use in pipeline.
@@ -46,6 +44,7 @@ def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, job_id,
        filenames: list of all files to be uploaded
        filepaths: list of complete urls to images in Amazon S3 bucket
        job_id: internal Figure Eight id for job
        log_name: name for log file
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
        label_only: flag specifying whether annotators will be restricted to label edit mode
        rgb_mode: flag specifying whether annotators will view images in RGB mode
@@ -71,88 +70,7 @@ def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, job_id,
    os.chmod(log_dir, mode)

    # save csv file
    dataframe.to_csv(os.path.join(log_dir, log_name), index=False)


# TODO: update for caliban jobs
# def create_next_CSV(csv_dir, job_id, next_stage):
# """Downloads job report from a Caliban job and uses provided info to create the CSV for
# the next job in the sequence.
#
# Returns:
# string: identifier used for previous job in sequence. Returned to make it easy
# to move the next job along without having to look somewhere to find identifier"""
#
# # job_report_csv creates CSV dir if does not already exist, so we use parent directory here
# base_dir = os.path.dirname(csv_dir)
# job_report_csv = download_and_unzip(job_id, base_dir, "full")
#
# s3 = connect_aws()
#
# csv_data = pd.read_csv(job_report_csv)
#
# filepath_list = []
#
# for row in csv_data.itertuples():
# # get info needed to construct new project_url
# input_bucket = row.input_bucket
# output_bucket = row.output_bucket
# subfolders = row.subfolders
# stage = row.stage
# filename = row.filename
# pixel_only = row.pixel_only
# label_only = row.label_only
# rgb_mode = row.rgb_mode
#
# key_src = "{0}/{1}/{2}".format(subfolders, stage, filename)
# key_dst = "{0}/{1}/{2}".format(subfolders, next_stage, filename)
#
# # transfer output file to new key in input bucket
# print("Moving {0} to {1}/{2} in {3}.".format(filename, subfolders,
# next_stage, input_bucket))
# aws_transfer_file(s3, input_bucket, output_bucket, key_src, key_dst)
#
# subfolders = re.split('/', subfolders)
# subfolders = '__'.join(subfolders)
#
# optional_flags = np.any(pixel_only, label_only, rgb_mode)
#
# if optional_flags:
# optional_url = "?"
# if pixel_only:
# optional_url += "&pixel_only=true"
# if label_only:
# optional_url += "&label_only=true"
# if rgb_mode:
# optional_url += "&rgb=true"
#
# new_filepath = "https://caliban.deepcell.org/
# {0}__{1}__{2}__{3}__{4}".format(input_bucket, output_bucket, subfolders,
# next_stage, filename)
#
# if optional_flags:
# new_filepath += optional_url
#
# filepath_list.append(new_filepath)
#
# data = {'project_url': filepath_list,
# 'filename': csv_data['filename'].values,
# 'identifier': csv_data['identifier'].values,
# 'stage': next_stage,
# 'input_bucket': input_bucket,
# 'output_bucket': output_bucket,
# 'subfolders': csv_data['subfolders'].values}
#
# # pull identifier info from csv_data, this will be used in filename saving
# # note: not suited for job reports that have a mix of identifiers
# identifier = csv_data['identifier'].values[0]
#
# next_job_df = pd.DataFrame(data=data, index=range(len(filepath_list)))
# next_csv_name = os.path.join(csv_dir, '{0}_{1}_upload.csv'.format(identifier, next_stage))
#
# next_job_df.to_csv(next_csv_name, index=False)
#
# return identifier


# deprecated: this function is for figure8 PLSS output.
6 changes: 4 additions & 2 deletions caliban_toolbox/log_file_test.py
@@ -40,10 +40,12 @@ def test_create_upload_log():
    job_id = '007'

    with tempfile.TemporaryDirectory() as temp_dir:
        log_name = 'test_log.csv'
        create_upload_log(base_dir=temp_dir, stage=stage, aws_folder=aws_folder,
                          filenames=filenames, filepaths=filepaths, job_id=job_id,
                          pixel_only=False, rgb_mode=True, label_only=True,
                          log_name=log_name)

        log_file = pd.read_csv(os.path.join(temp_dir, 'logs', log_name))

        assert np.all(log_file['filename'] == filenames)
1 change: 1 addition & 0 deletions caliban_toolbox/reshape_data_test.py
@@ -25,6 +25,7 @@
# ==============================================================================
import os
import tempfile
import pytest

import numpy as np
import xarray as xr
2 changes: 1 addition & 1 deletion caliban_toolbox/utils/crop_utils_test.py
@@ -23,7 +23,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import skimage.measure

import numpy as np
from caliban_toolbox import reshape_data
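
The switch from a bare 'import skimage' matters because importing the top-level scikit-image package does not necessarily import its submodules, so attribute access like skimage.measure.label can fail with an AttributeError. Importing the submodule explicitly is the safe pattern:

import skimage.measure  # a bare 'import skimage' may not expose skimage.measure

import numpy as np

labels = skimage.measure.label(np.array([[0, 1], [1, 0]]))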