Skip to content

Commit

Permalink
Merge 582249c into 23d9e2a
Browse files Browse the repository at this point in the history
  • Loading branch information
ngreenwald committed Jun 7, 2020
2 parents 23d9e2a + 582249c commit 7ac835e
Show file tree
Hide file tree
Showing 8 changed files with 465 additions and 2 deletions.
79 changes: 79 additions & 0 deletions caliban_toolbox/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pandas as pd
import numpy as np


def make_experiment_metadata_file(raw_metadata, image_names):
    """Builds the tracking table for the images of a single experiment

    Args:
        raw_metadata: metadata from the raw ontology; its 'PROJECT_ID' and
            'EXPERIMENT_ID' entries are read
        image_names: names of the images that are being processed

    Returns:
        pd.DataFrame: table with the columns PROJECT_ID, EXPERIMENT_ID, image_name,
            job_folder, job_id and status. job_folder and job_id are set to 'NA' and
            status to 'awaiting_prediction'.
    """

    project_id = raw_metadata['PROJECT_ID']
    experiment_id = raw_metadata['EXPERIMENT_ID']

    # image_names is the only list-like column; the remaining values are scalars
    columns = dict(
        PROJECT_ID=project_id,
        EXPERIMENT_ID=experiment_id,
        image_name=image_names,
        job_folder='NA',
        job_id='NA',
        status='awaiting_prediction',
    )

    return pd.DataFrame(columns)


def update_job_metadata(metadata, update_dict):
    """Updates the status of in-progress images after a job has been reviewed

    Args:
        metadata: pd.DataFrame with `image_name` and `status` columns. It is
            modified in place.
        update_dict: dictionary whose 'included' and 'excluded' entries hold the
            names of the images to mark with the respective status. A name present
            in both entries ends up as 'excluded', since that update is applied last.

    Returns:
        pd.DataFrame: the same metadata object, with updated statuses

    Raises:
        ValueError: if any supplied image name does not belong to a row whose
            status is 'in_progress'. Nothing is modified in that case.
    """

    # TODO: check that these images belong to specific job
    # TODO: figure out workflow for remaining in progress jobs

    in_progress_rows = metadata['status'] == 'in_progress'
    in_progress = metadata.loc[in_progress_rows, 'image_name']
    updates = [(new_status, update_dict[new_status]) for new_status in ('included', 'excluded')]

    # validate everything up front so a bad request leaves the metadata untouched
    for new_status, names in updates:
        supplied = np.atleast_1d(names)
        invalid = supplied[~np.isin(supplied, in_progress)]
        if invalid.size > 0:
            raise ValueError('Invalid fovs supplied: {} images are not in progress: '
                             '{}'.format(new_status, invalid.tolist()))

    # only rows that were in progress are touched, so a same-named row in
    # another state keeps its status
    for new_status, names in updates:
        matches = np.isin(metadata['image_name'], names)
        metadata.loc[in_progress_rows & matches, 'status'] = new_status

    return metadata
79 changes: 79 additions & 0 deletions caliban_toolbox/metadata_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np

from caliban_toolbox import metadata
import importlib
importlib.reload(metadata)


def _make_raw_metadata():
metadata_file = {'PROJECT_ID': np.random.randint(1, 100),
'EXPERIMENT_ID': np.random.randint(1, 100)}

return metadata_file


def _make_fov_ids(num_fovs):
all_fovs = np.random.randint(low=1, high=num_fovs * 10, size=num_fovs)
fovs = ['fov_{}'.format(i) for i in all_fovs]

return fovs


def test_make_experiment_metadata_file():
    """Checks that the experiment table carries over the raw ids and every image name"""
    raw = _make_raw_metadata()
    fov_ids = _make_fov_ids(10)

    table = metadata.make_experiment_metadata_file(raw, fov_ids)

    # ids from the raw metadata show up on the first row
    for id_column in ('PROJECT_ID', 'EXPERIMENT_ID'):
        assert table.loc[0, id_column] == raw[id_column]

    # every requested image is present in the table
    assert np.isin(fov_ids, table['image_name']).all()


def test_update_job_metadata():
    """Checks that update_job_metadata relabels only the images named in the update"""
    raw_metadata = _make_raw_metadata()
    image_names = _make_fov_ids(10)
    experiment_metadata = metadata.make_experiment_metadata_file(raw_metadata, image_names)

    # pretend every image has been sent out as part of a job
    experiment_metadata['status'] = 'in_progress'

    included_images = image_names[:6]
    excluded_images = image_names[6:8]
    in_process = image_names[8:]

    updated_metadata = metadata.update_job_metadata(metadata=experiment_metadata,
                                                    update_dict={'included': included_images,
                                                                 'excluded': excluded_images})
    pred_included = updated_metadata.loc[updated_metadata.status == 'included', 'image_name']
    assert np.all(np.isin(pred_included, included_images))

    pred_excluded = updated_metadata.loc[updated_metadata.status == 'excluded', 'image_name']
    assert np.all(np.isin(pred_excluded, excluded_images))

    # Images left out of the update keep the 'in_progress' status assigned above. No row
    # has 'awaiting_prediction' at this point, so querying that status would select
    # nothing and make the assertion vacuous.
    pred_in_progress = updated_metadata.loc[updated_metadata.status == 'in_progress',
                                            'image_name']

    assert np.all(np.isin(pred_in_progress, in_process))
85 changes: 85 additions & 0 deletions caliban_toolbox/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import os

import numpy as np
import pandas as pd
import xarray as xr

from caliban_toolbox import metadata
from caliban_toolbox.utils.pipeline_utils import get_job_folder_name


def create_experiment_folder(image_names, raw_metadata, base_dir):
    """Sets up the folder for a new experiment and writes its metadata file

    The folder is named 'experiment_<EXPERIMENT_ID>' and is created with os.makedirs,
    so an already existing folder of that name results in an error.

    Args:
        image_names: names of images from current experiment
        raw_metadata: metadata from the raw ontology; must provide 'EXPERIMENT_ID'
        base_dir: directory where experiment folder will be created

    Returns:
        string: full path to newly created experiment folder
    """

    folder_name = 'experiment_{}'.format(raw_metadata['EXPERIMENT_ID'])
    experiment_folder = os.path.join(base_dir, folder_name)
    os.makedirs(experiment_folder)

    # build the experiment table and store it next to the (future) job folders
    metadata_path = os.path.join(experiment_folder, 'metadata.csv')
    metadata.make_experiment_metadata_file(raw_metadata, image_names).to_csv(metadata_path)

    return experiment_folder


def create_job_folder(experiment_dir, metadata, fov_data, fov_names, fov_num):
    """Creates a folder holding the data for a single caliban job

    Up to fov_num images whose status is 'awaiting_prediction' are assigned to the
    job: their status becomes 'in_progress', their job_folder is set to the name of
    the new job folder, and the matching entries of fov_data are saved to
    'raw_data.npz' (under the key X) inside that folder. The updated metadata is
    written to 'metadata.csv' in experiment_dir.

    Args:
        experiment_dir: directory of relevant experiment
        metadata: pd.DataFrame with `image_name`, `status` and `job_folder` columns;
            modified in place
        fov_data: image data, indexed along its first axis by a boolean mask built
            from fov_names (presumably one entry per name in fov_names - confirm
            with caller)
        fov_names: names identifying the entries of fov_data
        fov_num: number of FOVs to include in job
    """

    # Create sequentially named job folder
    job_folder_path, job_name = get_job_folder_name(experiment_dir)
    os.makedirs(job_folder_path)

    # pick the first fov_num images that have not been assigned to a job yet
    awaiting = metadata['status'] == 'awaiting_prediction'
    new_fov_names = metadata.loc[awaiting, 'image_name'][:fov_num].values

    # record the assignment in the metadata
    assigned_rows = metadata['image_name'].isin(new_fov_names)
    metadata.loc[assigned_rows, ['status', 'job_folder']] = 'in_progress', job_name

    # pull out the image data belonging to the selected names
    selected = np.isin(fov_names, new_fov_names)
    raw_data_path = os.path.join(job_folder_path, 'raw_data.npz')
    np.savez(raw_data_path, X=fov_data[selected])

    metadata.to_csv(os.path.join(experiment_dir, 'metadata.csv'))
90 changes: 90 additions & 0 deletions caliban_toolbox/pipeline_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import tempfile

import numpy as np
import pandas as pd
import xarray as xr

from caliban_toolbox import pipeline
import importlib
importlib.reload(pipeline)


def _make_raw_metadata():
metadata_file = {'PROJECT_ID': np.random.randint(1, 100),
'EXPERIMENT_ID': np.random.randint(1, 100)}

return metadata_file


def _make_fov_ids(num_fovs):
all_fovs = np.random.randint(low=1, high=num_fovs * 10, size=num_fovs)
fovs = ['fov_{}'.format(i) for i in all_fovs]

return fovs


def _make_exp_metadata(num_fovs):
    """Builds an experiment metadata table in which every image awaits prediction

    Args:
        num_fovs: number of FOV rows to generate

    Returns:
        pd.DataFrame: table with image_name, EXPERIMENT_ID, status and job_folder columns
    """
    image_names = _make_fov_ids(num_fovs)
    experiment_id = _make_raw_metadata()['EXPERIMENT_ID']

    columns = {
        'image_name': image_names,
        'EXPERIMENT_ID': experiment_id,
        'status': 'awaiting_prediction',
        'job_folder': 'NA',
    }

    return pd.DataFrame(columns)


def test_create_experiment_folder():
    """Checks that the experiment folder contains a metadata file for the given images"""
    fov_ids = _make_fov_ids(10)
    raw_metadata = _make_raw_metadata()

    with tempfile.TemporaryDirectory() as base_dir:
        folder = pipeline.create_experiment_folder(image_names=fov_ids,
                                                   raw_metadata=raw_metadata,
                                                   base_dir=base_dir)

        # read back what was written to disk
        csv_path = os.path.join(folder, 'metadata.csv')
        saved = pd.read_csv(csv_path)

        assert np.isin(saved['image_name'], fov_ids).all()
        assert saved.loc[0, 'EXPERIMENT_ID'] == raw_metadata['EXPERIMENT_ID']


def test_create_job_folder():
    """Checks that the images assigned to a new job are marked as in progress on disk"""
    fov_num = 7
    exp_metadata = _make_exp_metadata(10)
    fov_names = exp_metadata['image_name'].values
    fov_data = np.zeros((len(fov_names), 20, 20, 3))

    with tempfile.TemporaryDirectory() as experiment_dir:
        pipeline.create_job_folder(experiment_dir, exp_metadata, fov_data, fov_names, fov_num)

        saved = pd.read_csv(os.path.join(experiment_dir, 'metadata.csv'))

        # the first fov_num images were all awaiting prediction, so they form the job
        in_job = np.isin(saved.image_name, fov_names[:fov_num])
        job_status = saved.loc[in_job, 'status']

        assert np.isin(job_status, 'in_progress').all()
59 changes: 59 additions & 0 deletions caliban_toolbox/utils/pipeline_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright 2016-2020 The Van Valen Lab at the California Institute of
# Technology (Caltech), with support from the Paul Allen Family Foundation,
# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
# All rights reserved.
#
# Licensed under a modified Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
#
# The Work provided may be used for non-commercial academic purposes only.
# For any other use of the Work, including commercial use, please contact:
# vanvalenlab@gmail.com
#
# Neither the name of Caltech nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific
# prior written permission.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import os
import json

import numpy as np

from caliban_toolbox import metadata


def get_job_folder_name(experiment_dir):
    """Identify the name for the next sequentially numbered job folder

    Job folders are directories named 'caliban_job_<N>' with N a non-negative integer.
    The next name uses the largest existing N plus one, or 0 when there is no job
    folder yet. Files and directories that do not follow this naming pattern are
    ignored. The folder itself is not created here.

    Args:
        experiment_dir: full path to directory of current experiment

    Returns:
        string: full path for the next job folder
        string: name of the job folder
    """

    prefix = 'caliban_job_'

    job_nums = []
    for entry in os.listdir(experiment_dir):
        suffix = entry[len(prefix):]
        if not (entry.startswith(prefix) and suffix.isdecimal()):
            continue
        if os.path.isdir(os.path.join(experiment_dir, entry)):
            job_nums.append(int(suffix))

    # compare numerically: sorting the names as strings would rank
    # 'caliban_job_9' after 'caliban_job_10' and reuse an existing name
    next_num = max(job_nums) + 1 if job_nums else 0

    new_folder = '{}{}'.format(prefix, next_num)
    new_folder_path = os.path.join(experiment_dir, new_folder)

    return new_folder_path, new_folder

0 comments on commit 7ac835e

Please sign in to comment.