Merge 582249c into 23d9e2a

vanvalenlab · Jun 7, 2020 · 7ac835e · 7ac835e
2 parents 23d9e2a + 582249c
commit 7ac835e
Show file tree

Hide file tree

Showing 8 changed files with 465 additions and 2 deletions.
diff --git a/caliban_toolbox/metadata.py b/caliban_toolbox/metadata.py
@@ -0,0 +1,79 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import pandas as pd
+import numpy as np
+
+
+def make_experiment_metadata_file(raw_metadata, image_names):
+    """Build the initial metadata table for one experiment
+
+    The table has one row per entry of image_names. The project and experiment
+    ids are copied from raw_metadata onto every row; no image is assigned to a
+    job yet, so the job columns hold the placeholder 'NA'.
+
+    Args:
+        raw_metadata: metadata from the raw ontology; must provide the
+            'PROJECT_ID' and 'EXPERIMENT_ID' entries
+        image_names: names of images that are being processed
+
+    Returns:
+        pd.DataFrame: metadata file
+    """
+
+    # values shared by every image in the experiment
+    columns = {key: raw_metadata[key] for key in ('PROJECT_ID', 'EXPERIMENT_ID')}
+
+    columns['image_name'] = image_names
+
+    # job bookkeeping: nothing has been assigned or predicted yet
+    columns['job_folder'] = 'NA'
+    columns['job_id'] = 'NA'
+    columns['status'] = 'awaiting_prediction'
+
+    return pd.DataFrame(columns)
+
+
+def update_job_metadata(metadata, update_dict):
+    """Record which in-progress images of a job were included or excluded
+
+    The status column of metadata is modified in place, and the same object is
+    returned. Images named under both keys end up 'excluded', because that
+    status is written last.
+
+    Args:
+        metadata: the metadata file to be updated
+        update_dict: the dictionary containing the update stats for the job;
+            its 'included' and 'excluded' entries hold image names
+
+    Returns:
+        pd.DataFrame: updated metadata file
+
+    Raises:
+        ValueError: if any supplied image is not currently 'in_progress'
+    """
+
+    # TODO: check that these images belong to specific job
+    # TODO: figure out workflow for remaining in progress jobs
+
+    is_in_progress = metadata.status == 'in_progress'
+    in_progress = metadata.loc[is_in_progress, 'image_name']
+
+    included = update_dict['included']
+    excluded = update_dict['excluded']
+
+    # only images that are in progress may be given a final status; validate
+    # both lists before anything is written
+    for supplied_fovs in (included, excluded):
+        if not np.all(np.isin(supplied_fovs, in_progress)):
+            raise ValueError('Invalid fovs supplied')
+
+    for new_status, fovs in (('included', included), ('excluded', excluded)):
+        rows = np.isin(metadata.image_name, fovs)
+        metadata.loc[rows, 'status'] = new_status
+
+    return metadata
diff --git a/caliban_toolbox/metadata_test.py b/caliban_toolbox/metadata_test.py
@@ -0,0 +1,79 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+
+from caliban_toolbox import metadata
+import importlib
+importlib.reload(metadata)
+
+
+def _make_raw_metadata():
+    """Return a minimal raw metadata dict with random project and experiment ids"""
+    # ids are drawn from [1, 100); the project id is drawn first
+    project_id = np.random.randint(1, 100)
+    experiment_id = np.random.randint(1, 100)
+
+    return {'PROJECT_ID': project_id, 'EXPERIMENT_ID': experiment_id}
+
+
+def _make_fov_ids(num_fovs):
+    """Generate unique, randomly numbered FOV names
+
+    Args:
+        num_fovs: number of names to generate
+
+    Returns:
+        list: num_fovs distinct strings of the form 'fov_<k>', with k drawn
+            from [1, num_fovs * 10)
+    """
+    # Sample without replacement: image names identify rows of the metadata
+    # table, and a repeated name would let a status written for one row
+    # silently apply to another.
+    candidates = np.arange(1, num_fovs * 10)
+    all_fovs = np.random.choice(candidates, size=num_fovs, replace=False)
+    fovs = ['fov_{}'.format(i) for i in all_fovs]
+
+    return fovs
+
+
+def test_make_experiment_metadata_file():
+    """The generated table carries the raw ids and lists every image name"""
+    raw_metadata = _make_raw_metadata()
+    image_names = _make_fov_ids(10)
+    experiment_metadata = metadata.make_experiment_metadata_file(raw_metadata, image_names)
+
+    # ids from the raw metadata are copied onto the rows of the table
+    for id_column in ('PROJECT_ID', 'EXPERIMENT_ID'):
+        assert experiment_metadata.loc[0, id_column] == raw_metadata[id_column]
+
+    # every requested image appears in the table
+    names_present = np.isin(image_names, experiment_metadata['image_name'])
+    assert np.all(names_present)
+
+
+def test_update_job_metadata():
+    """update_job_metadata relabels only the images named in the update dict"""
+    raw_metadata = _make_raw_metadata()
+    image_names = _make_fov_ids(10)
+    experiment_metadata = metadata.make_experiment_metadata_file(raw_metadata, image_names)
+
+    # images can only be included or excluded while they are in progress
+    experiment_metadata['status'] = 'in_progress'
+
+    included_images = image_names[:6]
+    excluded_images = image_names[6:8]
+    in_process = image_names[8:]
+
+    updated_metadata = metadata.update_job_metadata(metadata=experiment_metadata,
+                                                    update_dict={'included': included_images,
+                                                                 'excluded': excluded_images})
+    pred_included = updated_metadata.loc[updated_metadata.status == 'included', 'image_name']
+    assert np.all(np.isin(pred_included, included_images))
+
+    pred_excluded = updated_metadata.loc[updated_metadata.status == 'excluded', 'image_name']
+    assert np.all(np.isin(pred_excluded, excluded_images))
+
+    # Images left out of the update keep the 'in_progress' status set above.
+    # No row is 'awaiting_prediction' in this test, so filtering on that
+    # status would select nothing and the check could never fail.
+    pred_in_progress = updated_metadata.loc[updated_metadata.status == 'in_progress',
+                                            'image_name']
+
+    assert np.all(np.isin(pred_in_progress, in_process))
+
+    # the update must not introduce any other status
+    assert set(updated_metadata.status) <= {'included', 'excluded', 'in_progress'}
diff --git a/caliban_toolbox/pipeline.py b/caliban_toolbox/pipeline.py
@@ -0,0 +1,85 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+from caliban_toolbox import metadata
+from caliban_toolbox.utils.pipeline_utils import get_job_folder_name
+
+
+def create_experiment_folder(image_names, raw_metadata, base_dir):
+    """Create the folder for one experiment and write its initial metadata file
+
+    The folder is named 'experiment_<EXPERIMENT_ID>' and receives a
+    'metadata.csv' built from raw_metadata and image_names.
+
+    Args:
+        image_names: names of images from current experiment
+        raw_metadata: metadata file from raw ontology
+        base_dir: directory where experiment folder will be created
+
+    Returns:
+        string: full path to newly created experiment folder
+    """
+
+    folder_name = 'experiment_{}'.format(raw_metadata['EXPERIMENT_ID'])
+    experiment_folder = os.path.join(base_dir, folder_name)
+    os.makedirs(experiment_folder)
+
+    # build the per-image metadata table and store it inside the new folder
+    metadata_path = os.path.join(experiment_folder, 'metadata.csv')
+    metadata.make_experiment_metadata_file(raw_metadata, image_names).to_csv(metadata_path)
+
+    return experiment_folder
+
+
+def create_job_folder(experiment_dir, metadata, fov_data, fov_names, fov_num):
+    """Creates a folder to hold a single caliban job
+
+    The first fov_num images whose status is 'awaiting_prediction' are marked
+    'in_progress' and tagged with the new job folder name. The matching entries
+    of fov_data are saved as 'raw_data.npz' in the job folder, and the modified
+    metadata is written to 'metadata.csv' in experiment_dir. metadata is
+    modified in place and nothing is returned.
+
+    Args:
+        experiment_dir: directory of relevant experiment
+        metadata: table with 'image_name', 'status' and 'job_folder' columns
+        fov_data: image data, selected along its first axis by matching
+            fov_names (presumably fov_names[i] names fov_data[i]; confirm with caller)
+        fov_names: names of the images held in fov_data
+        fov_num: number of FOVs to include in job
+    """
+
+    # Create sequentially named job folder
+    job_folder_path, job_name = get_job_folder_name(experiment_dir)
+    os.makedirs(job_folder_path)
+
+    # pick the next images that have not been assigned to a job yet
+    awaiting = metadata['status'] == 'awaiting_prediction'
+    new_fov_names = metadata.loc[awaiting, 'image_name'][:fov_num].values
+
+    assigned = metadata['image_name'].isin(new_fov_names)
+    metadata.loc[assigned, ['status', 'job_folder']] = 'in_progress', job_name
+
+    # pull out the data belonging to the selected images
+    new_fov_data = fov_data[np.isin(fov_names, new_fov_names)]
+
+    raw_data_path = os.path.join(job_folder_path, 'raw_data.npz')
+    np.savez(raw_data_path, X=new_fov_data)
+
+    metadata_path = os.path.join(experiment_dir, 'metadata.csv')
+    metadata.to_csv(metadata_path)
diff --git a/caliban_toolbox/pipeline_test.py b/caliban_toolbox/pipeline_test.py
@@ -0,0 +1,90 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import os
+import tempfile
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+from caliban_toolbox import pipeline
+import importlib
+importlib.reload(pipeline)
+
+
+def _make_raw_metadata():
+    """Create a stand-in for the raw ontology metadata with random ids"""
+    # each id lies in [1, 100); the project id is drawn before the experiment id
+    project_id = np.random.randint(1, 100)
+    experiment_id = np.random.randint(1, 100)
+
+    return dict(PROJECT_ID=project_id, EXPERIMENT_ID=experiment_id)
+
+
+def _make_fov_ids(num_fovs):
+    """Generate unique, randomly numbered FOV names
+
+    Args:
+        num_fovs: number of names to generate
+
+    Returns:
+        list: num_fovs distinct strings of the form 'fov_<k>', with k drawn
+            from [1, num_fovs * 10)
+    """
+    # Sample without replacement: the pipeline selects images by name, so a
+    # repeated name would assign more rows to a job than the test asked for.
+    candidates = np.arange(1, num_fovs * 10)
+    all_fovs = np.random.choice(candidates, size=num_fovs, replace=False)
+    fovs = ['fov_{}'.format(i) for i in all_fovs]
+
+    return fovs
+
+
+def _make_exp_metadata(num_fovs):
+    """Build an experiment metadata table in which every image awaits prediction"""
+    image_names = _make_fov_ids(num_fovs)
+    experiment_id = _make_raw_metadata()['EXPERIMENT_ID']
+
+    columns = {
+        'image_name': image_names,
+        'EXPERIMENT_ID': experiment_id,
+        'status': 'awaiting_prediction',
+        'job_folder': 'NA',
+    }
+
+    return pd.DataFrame(columns)
+
+
+def test_create_experiment_folder():
+    """The new experiment folder holds a metadata.csv describing the images"""
+    image_names = _make_fov_ids(10)
+    raw_metadata = _make_raw_metadata()
+
+    with tempfile.TemporaryDirectory() as base_dir:
+        folder = pipeline.create_experiment_folder(image_names=image_names,
+                                                   raw_metadata=raw_metadata,
+                                                   base_dir=base_dir)
+
+        metadata_path = os.path.join(folder, 'metadata.csv')
+        saved = pd.read_csv(metadata_path)
+
+        # every saved row refers to one of the supplied images
+        names_known = np.isin(saved['image_name'], image_names)
+        assert np.all(names_known)
+
+        assert saved.loc[0, 'EXPERIMENT_ID'] == raw_metadata['EXPERIMENT_ID']
+
+
+def test_create_job_folder():
+    """Images assigned to a new job are saved with the 'in_progress' status"""
+    num_in_job = 7
+    exp_metadata = _make_exp_metadata(10)
+    fov_names = exp_metadata['image_name'].values
+    fov_data = np.zeros((len(fov_names), 20, 20, 3))
+
+    with tempfile.TemporaryDirectory() as experiment_dir:
+        pipeline.create_job_folder(experiment_dir, exp_metadata, fov_data, fov_names, num_in_job)
+
+        saved = pd.read_csv(os.path.join(experiment_dir, 'metadata.csv'))
+
+        # rows belonging to the first num_in_job images must all be in progress
+        in_job = np.isin(saved.image_name, fov_names[:num_in_job])
+        job_statuses = saved.loc[in_job, 'status']
+
+        assert np.all(np.isin(job_statuses, 'in_progress'))
diff --git a/caliban_toolbox/utils/pipeline_utils.py b/caliban_toolbox/utils/pipeline_utils.py
@@ -0,0 +1,59 @@
+# Copyright 2016-2020 The Van Valen Lab at the California Institute of
+# Technology (Caltech), with support from the Paul Allen Family Foundation,
+# Google, & National Institutes of Health (NIH) under Grant U24CA224309-01.
+# All rights reserved.
+#
+# Licensed under a modified Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.github.com/vanvalenlab/caliban-toolbox/LICENSE
+#
+# The Work provided may be used for non-commercial academic purposes only.
+# For any other use of the Work, including commercial use, please contact:
+# vanvalenlab@gmail.com
+#
+# Neither the name of Caltech nor the names of its contributors may be used
+# to endorse or promote products derived from this software without specific
+# prior written permission.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import json
+
+import numpy as np
+
+from caliban_toolbox import metadata
+
+
+def get_job_folder_name(experiment_dir):
+    """Identify the name for next sequentially named job folder
+
+    Existing job folders are the sub-directories of experiment_dir named
+    'caliban_job_<number>'. The next folder gets the highest existing number
+    plus one, or 0 when there are none. The folder itself is not created here.
+
+    Args:
+        experiment_dir: full path to directory of current experiment
+
+    Returns:
+        string: full path for the next job folder
+        string: name of the job folder
+    """
+
+    prefix = 'caliban_job_'
+
+    job_nums = []
+    for entry in os.listdir(experiment_dir):
+        suffix = entry[len(prefix):]
+
+        # only '<prefix><digits>' names count; other entries are skipped
+        # rather than crashing int()
+        if not (entry.startswith(prefix) and suffix.isdecimal()):
+            continue
+
+        if os.path.isdir(os.path.join(experiment_dir, entry)):
+            job_nums.append(int(suffix))
+
+    # Compare the numbers, not the names: as strings 'caliban_job_9' sorts after
+    # 'caliban_job_10', which would hand out an already used name.
+    next_num = max(job_nums) + 1 if job_nums else 0
+
+    new_folder = '{}{}'.format(prefix, next_num)
+    new_folder_path = os.path.join(experiment_dir, new_folder)
+
+    return new_folder_path, new_folder