
Commit da61e08
Merge b9cd0d9 into c08a4a6
ngreenwald committed Apr 24, 2020
2 parents c08a4a6 + b9cd0d9
Showing 30 changed files with 2,789 additions and 4,805 deletions.
2 changes: 0 additions & 2 deletions .coveragerc
@@ -11,5 +11,3 @@ exclude_lines =
ignore_errors = True
fail_under = 50
show_missing = True

omit = caliban_toolbox/deprecated/*
5 changes: 3 additions & 2 deletions .travis.yml
@@ -7,7 +7,8 @@ git:
language: python

python:
- 3.6
- 3.7.1
- 3.6

cache: pip

@@ -17,7 +18,7 @@ install:
- pip install pytest==5.2.0 pytest-cov==2.5.1 pytest-pep8 coveralls

script:
- python -m pytest caliban_toolbox tests
- python -m pytest --pep8 --cov=caliban_toolbox caliban_toolbox

after_success:
- coveralls
16 changes: 0 additions & 16 deletions caliban_toolbox/__init__.py
@@ -24,19 +24,3 @@
# limitations under the License.
# ==============================================================================
"""Data Engineering Toolbox for DeepCell"""

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

from caliban_toolbox import post_annotation
from caliban_toolbox import pre_annotation
from caliban_toolbox import utils

from caliban_toolbox.post_annotation import *
from caliban_toolbox.pre_annotation import *
from caliban_toolbox.utils import *

del absolute_import
del print_function
del division
156 changes: 156 additions & 0 deletions caliban_toolbox/aws_functions.py
@@ -0,0 +1,156 @@
# Copyright 2016-2020 David Van Valen at California Institute of Technology
# (Caltech), with support from the Paul Allen Family Foundation, Google,
# & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import sys
import boto3
import os
import threading
import re

from urllib.parse import urlencode

import numpy as np
from getpass import getpass

from caliban_toolbox.utils.utils import get_img_names, list_npzs_folder


# Taken from AWS Documentation
class ProgressPercentage(object):
def __init__(self, filename):
self._filename = filename
self._size = float(os.path.getsize(filename))
self._seen_so_far = 0
self._lock = threading.Lock()

def __call__(self, bytes_amount):
with self._lock:
self._seen_so_far += bytes_amount
percentage = (self._seen_so_far / self._size) * 100
sys.stdout.write(
"\r%s %s / %s (%.2f%%)" % (
self._filename, self._seen_so_far, self._size,
percentage))
sys.stdout.flush()


def connect_aws():
AWS_ACCESS_KEY_ID = getpass('What is your AWS access key id? ')
AWS_SECRET_ACCESS_KEY = getpass('What is your AWS secret access key? ')

session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
print('Connected to AWS')
s3 = session.client('s3')

return s3


def aws_upload_files(aws_folder, stage, upload_folder, pixel_only, label_only, rgb_mode):
"""Uploads files to AWS bucket for use in Figure 8
Args:
aws_folder: folder where uploaded files will be stored
stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
upload_folder: path to folder containing files that will be uploaded
pixel_only: boolean flag to set pixel_only mode
label_only: boolean flag to set label_only mode
rgb_mode: boolean flag to set rgb_mode
"""

s3 = connect_aws()

# load the images from specified folder but not the json log file
files_to_upload = list_npzs_folder(upload_folder)

filename_list = []

# change slashes separating nested folders to underscores for URL generation
subfolders = re.split('/', aws_folder)
subfolders = '__'.join(subfolders)

url_dict = {'pixel_only': pixel_only, 'label_only': label_only, 'rgb': rgb_mode}
url_encoded_dict = urlencode(url_dict)

# upload images
for img in files_to_upload:

# full path to image
img_path = os.path.join(upload_folder, img)

# destination path
img_key = os.path.join(aws_folder, stage, img)

# upload
s3.upload_file(img_path, 'caliban-input', img_key, Callback=ProgressPercentage(img_path),
ExtraArgs={'ACL': 'public-read', 'Metadata': {'source_path': img_path}})
print('\n')

url = 'https://caliban.deepcell.org/{}__{}__{}__' \
'{}__{}?{}'.format('caliban-input', 'caliban-output', subfolders, stage, img,
url_encoded_dict)

# add caliban url to list
filename_list.append(url)

return files_to_upload, filename_list


def aws_transfer_file(s3, input_bucket, output_bucket, key_src, key_dst):
"""Helper function to transfer files from one bucket/key to another. Used
in conjunction with a soon-to-be-created transfer jobs script for jobs with multiple stages"""

copy_source = {'Bucket': output_bucket,
'Key': key_src}

s3.copy(copy_source, input_bucket, key_dst,
ExtraArgs={'ACL': 'public-read'})


def aws_download_files(upload_log, output_dir):
"""Download files following Figure 8 annotation.
Args:
upload_log: pandas DataFrame containing information from the upload process
output_dir: directory where files will be saved
"""

s3 = connect_aws()

# get files
files_to_download = upload_log['filename']
aws_folder = upload_log['aws_folder'][0]
stage = upload_log['stage'][0]

# download all images
for img in files_to_download:

# full path to save image
save_path = os.path.join(output_dir, img)

# path to file in aws
img_path = os.path.join(aws_folder, stage, img)

s3.download_file(Bucket='caliban-output', Key=img_path, Filename=save_path)
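
For orientation, a minimal usage sketch of the new upload/download helpers. The base directory, AWS folder name, and the logs/stage_0_upload_log.csv layout (taken from the figure_eight_functions module added below) are illustrative placeholders, not part of this commit.

import os

import pandas as pd

from caliban_toolbox.aws_functions import aws_upload_files, aws_download_files

base_dir = 'path/to/job'  # placeholder job directory containing crop_dir/ and logs/

# upload NPZ crops for annotation; prompts for AWS credentials via getpass
filenames, urls = aws_upload_files(aws_folder='experiment_1/set_A',
                                   stage='stage_0',
                                   upload_folder=os.path.join(base_dir, 'crop_dir'),
                                   pixel_only=False, label_only=False, rgb_mode=True)

# after annotation, pull the corrected files back down using the upload log
output_dir = os.path.join(base_dir, 'output')
os.makedirs(output_dir, exist_ok=True)
upload_log = pd.read_csv(os.path.join(base_dir, 'logs/stage_0_upload_log.csv'))
aws_download_files(upload_log, output_dir)
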
197 changes: 197 additions & 0 deletions caliban_toolbox/figure_eight_functions.py
@@ -0,0 +1,197 @@
# Copyright 2016-2020 David Van Valen at California Institute of Technology
# (Caltech), with support from the Paul Allen Family Foundation, Google,
# & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import requests
import os
import stat
import zipfile
import pandas as pd
import urllib

from getpass import getpass
from caliban_toolbox.log_file import create_upload_log
from caliban_toolbox.aws_functions import aws_upload_files, aws_download_files


def copy_job(job_id, key):
"""Helper function to create a Figure 8 job based on existing job.
Args:
job_id: ID number of job to copy instructions and settings from when creating new job
key: API key to access Figure 8 account
Returns:
int: ID number of job created
"""

url = 'https://api.appen.com/v1/jobs/{}/copy.json?'.format(str(job_id))
API_key = {"key": key}

new_job = requests.get(url, params=API_key)
if new_job.status_code != 200:
print("copy_job not successful. Status code: ", new_job.status_code)
new_job_id = new_job.json()['id']

return new_job_id


def upload_data(csv_path, job_id, key):
"""Add data to an existing Figure 8 job by uploading a CSV file
Args:
csv_path: full path to csv
job_id: ID number of job to upload data to
key: API key to access Figure 8 account
"""

# format url with appropriate arguments
url = "https://api.appen.com/v1/jobs/{}/upload.json?{}"
url_dict = {'key': key, 'force': True}
url_encoded_dict = urllib.parse.urlencode(url_dict)
url = url.format(job_id, url_encoded_dict)

csv_file = open(csv_path, 'r')
csv_data = csv_file.read()

headers = {"Content-Type": "text/csv"}

add_data = requests.put(url, data=csv_data, headers=headers)
if add_data.status_code != 200:
print("Upload_data not successful. Status code: ", add_data.status_code)
else:
print("Data successfully uploaded to Figure Eight.")


def create_figure_eight_job(base_dir, job_id_to_copy, aws_folder, stage,
rgb_mode=False, label_only=False, pixel_only=False):
"""Create a Figure 8 job and upload data to it. New job ID printed out for convenience.
Args:
base_dir: full path to directory that contains CSV files
job_id_to_copy: ID number of Figure 8 job to use as template for new job
aws_folder: folder in AWS bucket where files will be stored
stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
label_only: flag specifying whether annotators will be restricted to label edit mode
rgb_mode: flag specifying whether annotators will view images in RGB mode
"""

key = str(getpass("Figure eight api key? "))

# copy job without data
new_job_id = copy_job(job_id_to_copy, key)
if new_job_id == -1:
return
print('New job ID is: ' + str(new_job_id))

# upload files to AWS bucket
upload_folder = os.path.join(base_dir, 'crop_dir')
filenames, filepaths = aws_upload_files(aws_folder=aws_folder, stage=stage,
upload_folder=upload_folder, pixel_only=pixel_only,
rgb_mode=rgb_mode, label_only=label_only)

# Generate log file for current job
create_upload_log(base_dir=base_dir, stage=stage, aws_folder=aws_folder,
filenames=filenames, filepaths=filepaths, job_id=new_job_id,
pixel_only=pixel_only, rgb_mode=rgb_mode, label_only=label_only)

# upload NPZs using log file
upload_data(os.path.join(base_dir, 'logs/stage_0_upload_log.csv'), new_job_id, key)


def download_report(job_id, log_dir):
"""Download job report from Figure 8
Args:
job_id: Figure 8 job id
log_dir: full path to log_dir where report will be saved
"""

if not os.path.isdir(log_dir):
print('Log directory does not exist: have you uploaded this job to Figure 8?')
os.makedirs(log_dir)

# add folder modification permissions to deal with files from file explorer
mode = stat.S_IRWXO | stat.S_IRWXU | stat.S_IRWXG
os.chmod(log_dir, mode)

save_path = os.path.join(log_dir, 'job_report.zip')

# password prompt for api info
key = str(getpass("Please enter your Figure Eight API key:"))

# construct url
url = "https://api.appen.com/v1/jobs/{}.csv?".format(job_id)

params = {"type": 'full', "key": key}

# make http request: python requests handles redirects
csv_request = requests.get(url, params=params, allow_redirects=True)
open(save_path, 'wb').write(csv_request.content)
print('Report saved to folder')


def unzip_report(log_dir):
"""Unzips .csv file and renames it appropriately
Args:
log_dir: full path to log_dir for saving zip
"""

# Extract zip
zip_path = os.path.join(log_dir, 'job_report.zip')
with zipfile.ZipFile(zip_path, "r") as zip_ref:
default_name = zip_ref.namelist()[0] # get filename so can rename later
zip_ref.extractall(log_dir)

# rename from Figure 8 default
default_name_path = os.path.join(log_dir, default_name) # should only be one file in zip
new_name_path = os.path.join(log_dir, 'job_report.csv')
os.rename(default_name_path, new_name_path)


def download_figure_eight_output(base_dir):
"""Gets annotated files from a Figure 8 job
Args:
base_dir: directory containing relevant job files
"""

# get information from job creation
# TODO: check for latest stage job report and use that one
log_file = pd.read_csv(os.path.join(base_dir, 'logs/stage_0_upload_log.csv'))
job_id = log_file['job_id'][0]

# download Figure 8 report
log_dir = os.path.join(base_dir, 'logs')
download_report(job_id=job_id, log_dir=log_dir)
unzip_report(log_dir=log_dir)

# download annotations from aws
output_dir = os.path.join(base_dir, 'output')
if not os.path.isdir(output_dir):
os.makedirs(output_dir)

upload_log = pd.read_csv(os.path.join(base_dir, 'logs/stage_0_upload_log.csv'))
aws_download_files(upload_log, output_dir)
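
Taken together, a hypothetical end-to-end flow for the new module. The template job ID, AWS folder, and base_dir layout are illustrative, and both steps prompt for a Figure Eight API key via getpass.

from caliban_toolbox.figure_eight_functions import (create_figure_eight_job,
                                                    download_figure_eight_output)

base_dir = 'path/to/job'   # must contain a crop_dir folder of NPZ crops
template_job_id = 1234567  # existing Figure 8 job to copy settings from

# copy the template job, upload crops to S3, and attach the CSV of Caliban URLs
create_figure_eight_job(base_dir=base_dir, job_id_to_copy=template_job_id,
                        aws_folder='experiment_1/set_A', stage='stage_0',
                        rgb_mode=True)

# once annotation is finished, fetch the job report and the corrected NPZs
download_figure_eight_output(base_dir)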
