Merge 06cc0e4 into 23d9e2a

ngreenwald committed Jun 7, 2020
2 parents 23d9e2a + 06cc0e4 commit e9d9e8e
Showing 8 changed files with 126 additions and 176 deletions.
30 changes: 20 additions & 10 deletions caliban_toolbox/aws_functions.py
@@ -70,9 +70,9 @@ def connect_aws():
def aws_upload_files(local_paths, aws_paths):
"""Uploads files to AWS bucket for use in Figure 8
Args:
local_paths: list of paths to npz files
aws_paths: list of paths for saving npz files in AWS
Args:
local_paths: list of paths to npz files
aws_paths: list of paths for saving npz files in AWS
"""

s3 = connect_aws()
@@ -86,17 +86,27 @@ def aws_upload_files(local_paths, aws_paths):
    print('\n')
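
For orientation, a minimal usage sketch of this uploader; the local paths and destination keys below are hypothetical and must match one-to-one:

from caliban_toolbox.aws_functions import aws_upload_files

# hypothetical example: one AWS destination key per local NPZ crop
local_paths = ['/data/example_job/crops/crop_0.npz', '/data/example_job/crops/crop_1.npz']
aws_paths = ['example_job/stage_0/crop_0.npz', 'example_job/stage_0/crop_1.npz']

aws_upload_files(local_paths=local_paths, aws_paths=aws_paths)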


def aws_transfer_file(s3, input_bucket, output_bucket, key_src, key_dst):
"""Helper function to transfer files from one bucket/key to another. Used
in conjunction with a soon-to-be-created transfer jobs script for jobs with multiple stages"""
def aws_copy_files(current_folder, next_folder, filenames):
"""Copy files from one AWS bucket to another.
Args:
current_folder: aws folder with current files
next_folder: aws folder where files will be copied
filenames: list of NPZ files to copy
"""

    s3 = connect_aws()

    for file in filenames:
        copy_source = {'Bucket': 'caliban-output',
                       'Key': os.path.join(current_folder, file)}

        s3.copy(CopySource=copy_source, Bucket='caliban-input',
                Key=os.path.join(next_folder, file),
                ExtraArgs={'ACL': 'public-read'})
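
Note that the rewritten helper hard-codes the source bucket ('caliban-output') and destination bucket ('caliban-input'), so callers only supply folders and filenames. A usage sketch with hypothetical folder names:

from caliban_toolbox.aws_functions import aws_copy_files

# copy annotated stage_0 output forward so it becomes stage_1 input
aws_copy_files(current_folder='example_job/stage_0',
               next_folder='example_job/stage_1',
               filenames=['crop_0.npz', 'crop_1.npz'])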


# TODO: catch missing files
def aws_download_files(upload_log, output_dir):
"""Download files following Figure 8 annotation.
85 changes: 75 additions & 10 deletions caliban_toolbox/figure_eight_functions.py
@@ -23,6 +23,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import json
import requests
import os
import stat
@@ -35,7 +36,7 @@
from urllib.parse import urlencode

from caliban_toolbox.log_file import create_upload_log
from caliban_toolbox.aws_functions import aws_upload_files, aws_download_files
from caliban_toolbox.aws_functions import aws_upload_files, aws_copy_files, aws_download_files
from caliban_toolbox.utils.misc_utils import list_npzs_folder


@@ -47,16 +48,18 @@ def _format_url(aws_folder, stage, npz, url_encoded_dict):


def _create_next_log_name(previous_log_name, stage):
    stage_num = int(previous_log_name.split('_')[1])
    new_log = 'stage_{}_{}_upload_log.csv'.format(stage_num + 1, stage)

    return new_log
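
The int() cast here fixes a real bug: previous_log_name.split('_')[1] returns a string, so the uncast version of stage_num + 1 raised a TypeError. With the cast, the helper behaves as intended:

# 'stage_0_upload_log.csv' splits into ['stage', '0', 'upload', 'log.csv'], so stage_num is 0
_create_next_log_name('stage_0_upload_log.csv', 'annotation')
# returns 'stage_1_annotation_upload_log.csv'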


def get_latest_log_file(log_dir):
"""Find the latest log file in the log directory
Args:
log_dir: full path to log directory
Returns:
string: name of the latest log file
"""
@@ -69,18 +72,21 @@ def get_latest_log_file(log_dir):
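
The function body is collapsed in this view; a minimal sketch of one plausible implementation, assuming log files follow the stage_<n>_..._upload_log.csv convention used by _create_next_log_name above:

import os

def _latest_log_sketch(log_dir):
    # keep only upload logs, then sort on the integer stage number embedded in each name
    logs = [f for f in os.listdir(log_dir) if f.endswith('upload_log.csv')]
    return sorted(logs, key=lambda name: int(name.split('_')[1]))[-1]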

def create_job_urls(crop_dir, aws_folder, stage, pixel_only, label_only, rgb_mode):
"""Helper function to create relevant URLs for caliban log and AWS upload
Args:
crop_dir: full path to directory with the npz crops
aws_folder: path for images to be stored in AWS
stage: which stage of the correction process this job is for
pixel_only: boolean flag to determine if only pixel mode is available
label_only: boolean flag to determine if only label is available
rgb_mode: boolean flag to determine if rgb mode will be enabled
Returns:
list: list of paths to local NPZs to be uploaded
list: list of paths to desintation for NPZs
list: list of URLs to supply to figure8 to to display crops
list: list of NPZs that will be uploaded
Raises:
ValueError: If URLs are not valid
"""
@@ -103,7 +109,7 @@ def create_job_urls(crop_dir, aws_folder, stage, pixel_only, label_only, rgb_mode):
        npz_keys.append(os.path.join(aws_folder, stage, npz))
        url_paths.append(_format_url(subfolders, stage, npz, url_encoded_dict))

    # TODO: think about better way to structure than many lists
    return npz_paths, npz_keys, url_paths, npzs_to_upload
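
For reference, the shape of the Caliban URLs these helpers assemble can be inferred from the deprecated snippet removed from log_file.py later in this commit: double-underscore-separated fields plus optional url-encoded flags. A hedged illustration with hypothetical values (the authoritative template lives in _format_url, which is not shown in this diff):

from urllib.parse import urlencode

aws_folder, stage, npz = 'example_job', 'stage_1', 'crop_0.npz'
url_encoded_dict = urlencode({'rgb': 'true'})

url = 'https://caliban.deepcell.org/{0}__{1}__{2}__{3}__{4}?{5}'.format(
    'caliban-input', 'caliban-output', aws_folder, stage, npz, url_encoded_dict)
# -> 'https://caliban.deepcell.org/caliban-input__caliban-output__example_job__stage_1__crop_0.npz?rgb=true'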


@@ -144,9 +150,6 @@ def upload_log_file(log_file, job_id, key):
    url_encoded_dict = urllib.parse.urlencode(url_dict)
    url = url.format(job_id, url_encoded_dict)

    headers = {"Content-Type": "text/csv"}
    add_data = requests.put(url, data=log_file, headers=headers)

@@ -156,13 +159,14 @@
print("Data successfully uploaded to Figure Eight.")


def create_figure_eight_job(base_dir, job_id_to_copy, job_name, aws_folder, stage,
                            rgb_mode=False, label_only=False, pixel_only=False):
    """Create a Figure 8 job and upload data to it. New job ID printed out for convenience.

    Args:
        base_dir: full path to job directory
        job_id_to_copy: ID number of Figure 8 job to use as template for new job
        job_name: name for new job
        aws_folder: folder in aws bucket where files will be stored
        stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
@@ -190,10 +194,11 @@ def create_figure_eight_job(base_dir, job_id_to_copy, aws_folder, stage,

    # copy job without data
    new_job_id = copy_job(job_id_to_copy, key)
    if new_job_id == -1:
        return
    print('New job ID is: ' + str(new_job_id))

    # set name of new job
    rename_job(new_job_id, key, job_name)

    # get relevant paths
    npz_paths, npz_keys, url_paths, npzs = create_job_urls(crop_dir=upload_folder,
                                                           aws_folder=aws_folder,
@@ -219,6 +224,67 @@ def create_figure_eight_job(base_dir, job_id_to_copy, aws_folder, stage,
    upload_log_file(log_file, new_job_id, key)


def transfer_figure_eight_job(base_dir, job_id_to_copy, new_stage, job_name,
                              rgb_mode=False, label_only=False, pixel_only=False):
    """Create a Figure 8 job based on the output of a previous Figure 8 job

    Args:
        base_dir: full path to job directory
        job_id_to_copy: ID number of Figure 8 job to use as template for new job
        new_stage: specifies new_stage for subsequent job
        job_name: name for next job
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
        label_only: flag specifying whether annotators will be restricted to label edit mode
        rgb_mode: flag specifying whether annotators will view images in RGB mode
    """

    key = str(getpass("Figure eight api key?"))

    # copy job without data
    new_job_id = copy_job(job_id_to_copy, key)
    print('New job ID is: ' + str(new_job_id))

    # set name of new job
    rename_job(new_job_id, key, job_name)

    # get info from previous stage
    log_dir = os.path.join(base_dir, 'logs')
    previous_log_file = get_latest_log_file(log_dir)
    previous_log = pd.read_csv(os.path.join(log_dir, previous_log_file))
    filenames = previous_log['filename']
    previous_stage = previous_log['stage'][0]
    aws_folder = previous_log['aws_folder'][0]

    current_bucket = os.path.join(aws_folder, previous_stage)
    next_bucket = os.path.join(aws_folder, new_stage)

    # transfer files to new stage
    aws_copy_files(current_folder=current_bucket, next_folder=next_bucket,
                   filenames=filenames)

    new_log_name = _create_next_log_name(previous_log_file, new_stage)

    # TODO: Decide if this should be handled by a separate function that is specific to transfer?
    _, _, filepaths, _ = create_job_urls(crop_dir=os.path.join(base_dir, 'crop_dir'),
                                         aws_folder=aws_folder, stage=new_stage,
                                         pixel_only=pixel_only, label_only=label_only,
                                         rgb_mode=rgb_mode)

    # generate log file for current job
    create_upload_log(base_dir=base_dir, stage=new_stage, aws_folder=aws_folder,
                      filenames=filenames, filepaths=filepaths, job_id=new_job_id,
                      pixel_only=pixel_only, rgb_mode=rgb_mode, label_only=label_only,
                      log_name=new_log_name)

    log_path = open(os.path.join(base_dir, 'logs', new_log_name), 'r')
    log_file = log_path.read()

    # upload log file
    upload_log_file(log_file, new_job_id, key)

    return log_file
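
Taken together, the two entry points support multi-stage annotation: create the first job, let annotators work, then transfer its output into a follow-up job. A hedged end-to-end sketch with hypothetical arguments:

# stage 1: create the initial job from a template and upload crops
create_figure_eight_job(base_dir='/data/example_job', job_id_to_copy=123456,
                        job_name='example_stage_1', aws_folder='example_job',
                        stage='stage_1', rgb_mode=True)

# ... annotators complete the job in Figure Eight ...

# stage 2: copy the corrected output forward and launch the next round
transfer_figure_eight_job(base_dir='/data/example_job', job_id_to_copy=123456,
                          new_stage='stage_2', job_name='example_stage_2',
                          rgb_mode=True)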


def download_report(job_id, log_dir):
"""Download job report from Figure 8
Expand Down Expand Up @@ -284,7 +350,6 @@ def download_figure_eight_output(base_dir):
    job_id = log_file['job_id'][0]

    # download Figure 8 report
    download_report(job_id=job_id, log_dir=log_dir)
    unzip_report(log_dir=log_dir)

88 changes: 3 additions & 85 deletions caliban_toolbox/log_file.py
@@ -32,10 +32,8 @@
import numpy as np
import requests



def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, job_id, log_name,
                      pixel_only=False, label_only=False, rgb_mode=False):
    """Generates a csv log of parameters used during job creation for subsequent use in pipeline.
@@ -46,6 +44,7 @@ def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, job_id,
        filenames: list of all files to be uploaded
        filepaths: list of complete urls to images in Amazon S3 bucket
        job_id: internal Figure Eight id for job
        log_name: name for log file
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
        label_only: flag specifying whether annotators will be restricted to label edit mode
        rgb_mode: flag specifying whether annotators will view images in RGB mode
@@ -71,88 +70,7 @@ def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, job_id,
    os.chmod(log_dir, mode)

    # save csv file
    dataframe.to_csv(os.path.join(log_dir, log_name), index=False)


# TODO: update for caliban jobs
# def create_next_CSV(csv_dir, job_id, next_stage):
# """Downloads job report from a Caliban job and uses provided info to create the CSV for
# the next job in the sequence.
#
# Returns:
# string: identifier used for previous job in sequence. Returned to make it easy
# to move the next job along without having to look somewhere to find identifier"""
#
# # job_report_csv creates CSV dir if does not already exist, so we use parent directory here
# base_dir = os.path.dirname(csv_dir)
# job_report_csv = download_and_unzip(job_id, base_dir, "full")
#
# s3 = connect_aws()
#
# csv_data = pd.read_csv(job_report_csv)
#
# filepath_list = []
#
# for row in csv_data.itertuples():
# # get info needed to construct new project_url
# input_bucket = row.input_bucket
# output_bucket = row.output_bucket
# subfolders = row.subfolders
# stage = row.stage
# filename = row.filename
# pixel_only = row.pixel_only
# label_only = row.label_only
# rgb_mode = row.rgb_mode
#
# key_src = "{0}/{1}/{2}".format(subfolders, stage, filename)
# key_dst = "{0}/{1}/{2}".format(subfolders, next_stage, filename)
#
# # transfer output file to new key in input bucket
# print("Moving {0} to {1}/{2} in {3}.".format(filename, subfolders,
# next_stage, input_bucket))
# aws_transfer_file(s3, input_bucket, output_bucket, key_src, key_dst)
#
# subfolders = re.split('/', subfolders)
# subfolders = '__'.join(subfolders)
#
# optional_flags = np.any(pixel_only, label_only, rgb_mode)
#
# if optional_flags:
# optional_url = "?"
# if pixel_only:
# optional_url += "&pixel_only=true"
# if label_only:
# optional_url += "&label_only=true"
# if rgb_mode:
# optional_url += "&rgb=true"
#
# new_filepath = "https://caliban.deepcell.org/
# {0}__{1}__{2}__{3}__{4}".format(input_bucket, output_bucket, subfolders,
# next_stage, filename)
#
# if optional_flags:
# new_filepath += optional_url
#
# filepath_list.append(new_filepath)
#
# data = {'project_url': filepath_list,
# 'filename': csv_data['filename'].values,
# 'identifier': csv_data['identifier'].values,
# 'stage': next_stage,
# 'input_bucket': input_bucket,
# 'output_bucket': output_bucket,
# 'subfolders': csv_data['subfolders'].values}
#
# # pull identifier info from csv_data, this will be used in filename saving
# # note: not suited for job reports that have a mix of identifiers
# identifier = csv_data['identifier'].values[0]
#
# next_job_df = pd.DataFrame(data=data, index=range(len(filepath_list)))
# next_csv_name = os.path.join(csv_dir, '{0}_{1}_upload.csv'.format(identifier, next_stage))
#
# next_job_df.to_csv(next_csv_name, index=False)
#
# return identifier


# deprecated: this function is for figure8 PLSS output.
6 changes: 4 additions & 2 deletions caliban_toolbox/log_file_test.py
@@ -40,10 +40,12 @@ def test_create_upload_log():
    job_id = '007'

    with tempfile.TemporaryDirectory() as temp_dir:
        log_name = 'test_log.csv'
        create_upload_log(base_dir=temp_dir, stage=stage, aws_folder=aws_folder,
                          filenames=filenames, filepaths=filepaths, job_id=job_id,
                          pixel_only=False, rgb_mode=True, label_only=True,
                          log_name=log_name)

        log_file = pd.read_csv(os.path.join(temp_dir, 'logs', log_name))

        assert np.all(log_file['filename'] == filenames)
1 change: 1 addition & 0 deletions caliban_toolbox/reshape_data_test.py
@@ -25,6 +25,7 @@
# ==============================================================================
import os
import tempfile
import pytest

import numpy as np
import xarray as xr
2 changes: 1 addition & 1 deletion caliban_toolbox/utils/crop_utils_test.py
@@ -23,7 +23,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import skimage.measure

import numpy as np
from caliban_toolbox import reshape_data
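
The switch from a bare 'import skimage' matters because importing the top-level scikit-image package does not necessarily import its submodules, so attribute access like skimage.measure.label can fail with an AttributeError. Importing the submodule explicitly is the safe pattern:

import skimage.measure  # a bare 'import skimage' may not expose skimage.measure

import numpy as np

labels = skimage.measure.label(np.array([[0, 1], [1, 0]]))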