-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
484 additions
and
352 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
# Copyright 2016-2020 The Van Valen Lab at the California Institute of | ||
# Technology (Caltech), with support from the Paul Allen Family Foundation, | ||
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01. | ||
# All rights reserved. | ||
# | ||
# Licensed under a modified Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE | ||
# | ||
# The Work provided may be used for non-commercial academic purposes only. | ||
# For any other use of the Work, including commercial use, please contact: | ||
# vanvalenlab@gmail.com | ||
# | ||
# Neither the name of Caltech nor the names of its contributors may be used | ||
# to endorse or promote products derived from this software without specific | ||
# prior written permission. | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ============================================================================== | ||
import os | ||
|
||
import pandas as pd | ||
|
||
from caliban_toolbox import crowdsource | ||
from caliban_toolbox.utils.misc_utils import list_npzs_folder | ||
from caliban_toolbox.aws_functions import aws_upload_files, aws_download_files | ||
|
||
|
||
def create_upload_log(base_dir, stage, aws_folder, filenames, filepaths, log_name,
                      pixel_only=False, label_only=False, rgb_mode=False):
    """Generates csv logs of the parameters used during job creation.

    Two files are written to ``<base_dir>/logs`` (created if it does not exist):
    ``log_name`` holds one row per uploaded file together with every job
    parameter, and ``'small_' + log_name`` holds only the ``project_url`` column.

    Args:
        base_dir: full path to directory where job results will be stored
        stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
        aws_folder: folder in aws bucket where files will be stored
        filenames: list of all files to be uploaded
        filepaths: list of complete urls to images in Amazon S3 bucket
        log_name: name for log file
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode
        label_only: flag specifying whether annotators will be restricted to label edit mode
        rgb_mode: flag specifying whether annotators will view images in RGB mode
    """
    # scalar parameters are broadcast across one row per entry in filepaths
    data = {'project_url': filepaths,
            'filename': filenames,
            'stage': stage,
            'aws_folder': aws_folder,
            'pixel_only': pixel_only,
            'label_only': label_only,
            'rgb_mode': rgb_mode}
    dataframe = pd.DataFrame(data=data, index=range(len(filepaths)))

    # exist_ok avoids the check-then-create race of a separate isdir() test
    log_dir = os.path.join(base_dir, 'logs')
    os.makedirs(log_dir, exist_ok=True)

    # save the full log without the row index
    dataframe.to_csv(os.path.join(log_dir, log_name), index=False)

    # create csv containing only URLs to upload to anolytics website
    # (written with pandas' default settings, so the row index is included)
    small_df = pd.DataFrame({'project_url': filepaths})
    small_df.to_csv(os.path.join(log_dir, 'small_' + log_name))
|
||
|
||
def create_anolytics_job(base_dir, aws_folder, stage, rgb_mode=False, label_only=False,
                         pixel_only=False):
    """Upload the NPZ crops of a job to aws and write the matching upload log.

    The crops are read from ``<base_dir>/crop_dir``; the log is written by
    ``create_upload_log`` under the name ``stage_0_<stage>_upload_log.csv``.

    Args:
        base_dir: full path to job directory
        aws_folder: folder in aws bucket where files will be stored
        stage: specifies stage in pipeline for jobs requiring multiple rounds of annotation
        rgb_mode: flag specifying whether annotators will view images in RGB mode
        label_only: flag specifying whether annotators will be restricted to label edit mode
        pixel_only: flag specifying whether annotators will be restricted to pixel edit mode

    Raises:
        ValueError: If invalid base_dir supplied
        ValueError: If no crop directory found within base_dir
        ValueError: If no NPZs found in crop directory
    """
    # validate the expected directory layout before anything is uploaded
    if not os.path.isdir(base_dir):
        raise ValueError('Invalid directory name')

    crop_dir = os.path.join(base_dir, 'crop_dir')
    if not os.path.isdir(crop_dir):
        raise ValueError('No crop directory found within base directory')

    if len(list_npzs_folder(crop_dir)) == 0:
        raise ValueError('No NPZs found in crop dir')

    # local paths, bucket keys, annotation URLs and file names for every crop
    job_urls = crowdsource.create_job_urls(crop_dir=crop_dir,
                                           aws_folder=aws_folder,
                                           stage=stage,
                                           pixel_only=pixel_only,
                                           label_only=label_only,
                                           rgb_mode=rgb_mode)
    local_paths, bucket_keys, annotation_urls, npz_names = job_urls

    # upload files to AWS bucket
    aws_upload_files(local_paths=local_paths, aws_paths=bucket_keys)

    # record the parameters of this job for later pipeline steps
    create_upload_log(base_dir=base_dir,
                      stage=stage,
                      aws_folder=aws_folder,
                      filenames=npz_names,
                      filepaths=annotation_urls,
                      pixel_only=pixel_only,
                      rgb_mode=rgb_mode,
                      label_only=label_only,
                      log_name='stage_0_{}_upload_log.csv'.format(stage))
|
||
|
||
def download_anolytics_output(base_dir):
    """Gets annotated files from an anolytics job

    The most recent upload log in ``<base_dir>/logs`` is read and handed to
    ``aws_download_files`` together with ``<base_dir>/output`` (created if needed).

    Args:
        base_dir: directory containing relevant job files

    Returns:
        list: file names of NPZs not found in AWS bucket
            (the value returned by ``aws_download_files``)
    """
    # locate and load the log written when the job was created
    logs_path = os.path.join(base_dir, 'logs')
    newest_log_name = crowdsource.get_latest_log_file(logs_path)
    upload_log = pd.read_csv(os.path.join(logs_path, newest_log_name))

    # make sure the download destination exists
    download_dir = os.path.join(base_dir, 'output')
    if not os.path.isdir(download_dir):
        os.makedirs(download_dir)

    # download annotations from aws
    return aws_download_files(upload_log, download_dir)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Copyright 2016-2020 The Van Valen Lab at the California Institute of | ||
# Technology (Caltech), with support from the Paul Allen Family Foundation, | ||
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01. | ||
# All rights reserved. | ||
# | ||
# Licensed under a modified Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE | ||
# | ||
# The Work provided may be used for non-commercial academic purposes only. | ||
# For any other use of the Work, including commercial use, please contact: | ||
# vanvalenlab@gmail.com | ||
# | ||
# Neither the name of Caltech nor the names of its contributors may be used | ||
# to endorse or promote products derived from this software without specific | ||
# prior written permission. | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ============================================================================== | ||
import os | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from caliban_toolbox import anolytics | ||
from caliban_toolbox.aws_functions_test import FakeS3 | ||
|
||
|
||
def test_create_anolytics_job(mocker, tmp_path):
    """Creating a job from a crop directory with one NPZ writes the upload log."""
    # replace the credential prompt and the boto3 session with test doubles
    mocker.patch('getpass.getpass', lambda *x: 'test_api_key')
    mocker.patch('boto3.Session', FakeS3)

    # a crop directory holding a single (empty) NPZ archive
    npz_dir = os.path.join(tmp_path, 'crop_dir')
    os.makedirs(npz_dir)
    np.savez(os.path.join(npz_dir, 'test_crop.npz'))

    stage_name = 'stage'
    anolytics.create_anolytics_job(base_dir=tmp_path, aws_folder='aws', stage=stage_name)

    # the log must land in <tmp_path>/logs under the stage-specific name
    expected_log = os.path.join(tmp_path, 'logs',
                                'stage_0_{}_upload_log.csv'.format(stage_name))
    assert os.path.exists(expected_log)
|
||
|
||
def test_download_anolytics_output(mocker, tmp_path):
    """Downloading reports no missing files normally, and all files when S3 lacks them."""
    # replace the credential prompt and the boto3 session with test doubles
    mocker.patch('getpass.getpass', lambda *x: 'test_api_key')
    mocker.patch('boto3.Session', FakeS3)

    # create logs directory with upload log
    logs_path = os.path.join(tmp_path, 'logs')
    os.makedirs(logs_path)

    expected_names = ['example_1.npz', 'example_2.npz']
    upload_log = pd.DataFrame({'filename': expected_names,
                               'aws_folder': ['example_folder', 'example_folder'],
                               'stage': ['stage_0', 'stage_0']})
    upload_log.to_csv(os.path.join(logs_path, 'stage_0_upload_log.csv'))

    # with the default fake every file is found
    assert anolytics.download_anolytics_output(tmp_path) == []

    # catch missing file error, return list of missing files
    # (presumably FakeS3(raise_error='missing') simulates absent objects)
    mocker.patch('boto3.Session',
                 lambda aws_access_key_id, aws_secret_access_key: FakeS3(raise_error='missing'))
    missing_paths = anolytics.download_anolytics_output(tmp_path)
    missing_names = [os.path.basename(path) for path in missing_paths]
    assert missing_names == expected_names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# Copyright 2016-2020 The Van Valen Lab at the California Institute of | ||
# Technology (Caltech), with support from the Paul Allen Family Foundation, | ||
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01. | ||
# All rights reserved. | ||
# | ||
# Licensed under a modified Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE | ||
# | ||
# The Work provided may be used for non-commercial academic purposes only. | ||
# For any other use of the Work, including commercial use, please contact: | ||
# vanvalenlab@gmail.com | ||
# | ||
# Neither the name of Caltech nor the names of its contributors may be used | ||
# to endorse or promote products derived from this software without specific | ||
# prior written permission. | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ============================================================================== | ||
|
||
import os | ||
import re | ||
|
||
from urllib.parse import urlencode | ||
|
||
from caliban_toolbox.utils.misc_utils import list_npzs_folder | ||
|
||
|
||
def _format_url(aws_folder, stage, npz, url_encoded_dict): | ||
base_url = 'https://caliban.deepcell.org/caliban-input__caliban-output__{}__{}__{}?{}' | ||
formatted_url = base_url.format(aws_folder, stage, npz, url_encoded_dict) | ||
|
||
return formatted_url | ||
|
||
|
||
def _create_next_log_name(previous_log_name, stage): | ||
stage_num = int(previous_log_name.split('_')[1]) | ||
new_log = 'stage_{}_{}_upload_log.csv'.format(stage_num + 1, stage) | ||
|
||
return new_log | ||
|
||
|
||
def get_latest_log_file(log_dir):
    """Find the latest log file in the log directory

    Candidates are all entries of log_dir whose name contains 'upload_log.csv'.
    Names of the form 'stage_<number>_...' are ordered by their numeric stage
    (so 'stage_10_...' is later than 'stage_2_...') and are preferred over any
    other candidate; remaining ties are broken by plain string order.

    Args:
        log_dir: full path to log directory

    Returns:
        string: name of the latest log file

    Raises:
        IndexError: If log_dir contains no upload log files
    """
    log_files = [name for name in os.listdir(log_dir) if 'upload_log.csv' in name]

    # same exception type an empty list lookup would raise, with a clear message
    if not log_files:
        raise IndexError('No upload log files found in {}'.format(log_dir))

    def _stage_order(name):
        # a purely lexicographic sort would rank 'stage_10' before 'stage_2'
        match = re.match(r'stage_(\d+)_', name)
        if match is None:
            return (0, 0, name)
        return (1, int(match.group(1)), name)

    return max(log_files, key=_stage_order)
|
||
|
||
def create_job_urls(crop_dir, aws_folder, stage, pixel_only, label_only, rgb_mode):
    """Helper function to create relevant URLs for caliban log and AWS upload

    Args:
        crop_dir: full path to directory with the npz crops
        aws_folder: path for images to be stored in AWS
        stage: which stage of the correction process this job is for
        pixel_only: boolean flag to determine if only pixel mode is available
        label_only: boolean flag to determine if only label mode is available
        rgb_mode: boolean flag to determine if rgb mode will be enabled

    Returns:
        list: list of paths to local NPZs to be uploaded
        list: list of destination keys for the NPZs in the AWS bucket
        list: list of URLs to supply to the crowdsourcing platform to display crops
        list: list of NPZs that will be uploaded
    """
    # bucket keys must use forward slashes on every OS, unlike os.path.join
    import posixpath

    # TODO: check that URLS don't contain invalid character
    # load the images from specified folder but not the json log file
    npzs_to_upload = list_npzs_folder(crop_dir)

    # change slashes separating nested folders to underscores for URL generation
    subfolders = '__'.join(aws_folder.split('/'))

    # the boolean flags become the query string of every URL
    url_encoded_dict = urlencode({'pixel_only': pixel_only,
                                  'label_only': label_only,
                                  'rgb': rgb_mode})

    # create path to npz, key to upload npz, and url path for each crop
    npz_paths, npz_keys, url_paths = [], [], []
    for npz in npzs_to_upload:
        npz_paths.append(os.path.join(crop_dir, npz))
        npz_keys.append(posixpath.join(aws_folder, stage, npz))
        url_paths.append(_format_url(subfolders, stage, npz, url_encoded_dict))

    # TODO: think about better way to structure than many lists
    return npz_paths, npz_keys, url_paths, npzs_to_upload
Oops, something went wrong.