In [25]:
from campa.constants import EXPERIMENT_DIR, get_data_config
from pathlib import Path
import shutil
import os
import json

EXPERIMENT_DIR = "/Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments"

def prepare_test_experiment(name, cluster_subset=False, full_data_prediction=False, full_data_clustering=False):
    """
    Copy reference_experiment with the specified components to name
    """
    from_dir = Path(EXPERIMENT_DIR)/"reference_experiment/cVAE"
    to_dir = Path(EXPERIMENT_DIR)/"test_experiment"/name
    # delete to_dir if it exists
    if to_dir.exists():
        shutil.rmtree(to_dir)
    to_dir.mkdir(parents=True, exist_ok=True)

    # copy model
    files_to_copy = []
    dirs_to_copy = []
    # copy model
    files_to_copy.extend(["checkpoint", "config.json", "history.csv", 
        "weights_epoch005.data-00000-of-00001", 'weights_epoch005.index'])
    dirs_to_copy.append('results_epoch005')
    if cluster_subset:
        dirs_to_copy.extend(["aggregated/sub-0.1"])
    if full_data_prediction:
        full_data_base = Path("aggregated/full_data")
        full_data_files = ['channels.csv', 'latent.npy', 'metadata.csv', 'mpp_params.json', 'obj_ids.npy', 'x.npy', 'y.npy']
        for data_dir in ["184A1_unperturbed/I09", "184A1_meayamycin/I12"]:
            files_to_copy.extend([full_data_base/data_dir/f for f in full_data_files])
    if full_data_clustering:
        full_data_base = Path("aggregated/full_data")
        full_data_files = ['clustering.npy']
        for data_dir in ["184A1_unperturbed/I09", "184A1_meayamycin/I12"]:
            files_to_copy.extend([full_data_base/data_dir/f for f in full_data_files])
    
    # ensure dirs exist 
    for data_dir in ["184A1_unperturbed/I09", "184A1_meayamycin/I12"]:
        (to_dir/"aggregated/full_data"/data_dir).mkdir(parents=True, exist_ok=True)
    # copy files
    for f in files_to_copy:
        shutil.copy(from_dir/f, to_dir/f)
    # copy dirs
    for f in dirs_to_copy:
        shutil.copytree(from_dir/f, to_dir/f)

    # correct experiment dir + name in config.json
    config = json.load(open(to_dir/"config.json", 'r'))
    config['experiment']['dir'] = "test_experiment"
    config['experiment']['name'] = name
    json.dump(config, open(to_dir/"config.json", 'w'), indent=4)

prepare_test_experiment("exp1", cluster_subset=True, full_data_prediction=True, full_data_clustering=True)


In [1]:
from campa.tl import Experiment, run_experiments
from campa.data import create_dataset
from campa.tl import project_cluster_data, create_cluster_data, prepare_full_dataset, extract_features
from campa.data import NNDataset

def create_nn_dataset():
    # create test_dataset
    data_params = {
        'dataset_name': 'test_dataset',
        'data_config': "TestData",
        'data_dirs': 
                [os.path.join('184A1_unperturbed', well) for well in ['I09',]] + \
                [os.path.join('184A1_meayamycin', well) for well in ['I12',]],
        'channels': ['01_PABPC1', '03_CDK9', '09_SRRM2', '10_POL2RA_pS2', '11_PML',], 
        'condition': ['perturbation_duration_one_hot', 'cell_cycle_one_hot'],
        'condition_kwargs': {
            'cond_params': {}
        },
        'split_kwargs': {
            'train_frac': 0.35,
            'val_frac': 0.35,
        },
        'test_img_size': 225,
        'subset': True,
        'subset_kwargs': {
            'frac': None,
            'nona_condition': True,
            'cell_cycle': 'NO_NAN'
        },
        'subsample': True,
        'subsample_kwargs': {
            'frac': None,
            'frac_per_obj': None,
            'num': None,
            'num_per_obj': 100,
        },
        'neighborhood': True,
        'neighborhood_size': 3,
        'normalise': True,
        'normalise_kwargs': {
            'background_value': 'mean_background',
            'percentile': 98.0,
            'rescale_values': [],
        },
        'seed': 42,
    }
    create_dataset(data_params)


Reading config from /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/campa.ini


  doc = func(self, args[0].__doc__, *args[1:], **kwargs)


In [14]:
def test_nn_dataset():
    create_nn_dataset()
    test_ds = NNDataset("test_dataset", data_config='TestData')
    reference_ds = NNDataset("reference_dataset", data_config='TestData')

    # compare test and reference ds
    for split in ['train', 'val', 'test']:
        assert test_ds.data[split].compare(reference_ds.data[split])[0]
    for split in ['val', 'test']:
        assert test_ds.imgs[split].compare(reference_ds.imgs[split])[0]

test_nn_dataset()

In [22]:
import string
from campa.tl import Estimator, Cluster
import random
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

def train_model(model_name):
    from campa.tl import LossEnum, ModelEnum
    experiment_config = {
        "experiment": {
            "dir": "test_experiment",
            "name": model_name,
            "save_config": True,
        },
        "data": {
            "data_config": "TestData",
            "dataset_name": "reference_dataset",
            "output_channels": None,
        },
        "model": {
            "model_cls":  ModelEnum.VAEModel,
            "model_kwargs": {
                "num_neighbors": 3,
                "num_channels": 5,
                "num_output_channels": 5,
                "latent_dim": 4,
                # encoder definition
                "encoder_conv_layers": [16],
                "encoder_conv_kernel_size": [1],
                "encoder_fc_layers": [8],
                # decoder definition
                "decoder_fc_layers": [],
                "num_conditions": 6,
                "encode_condition": [6],
            },
            # if true, looks for saved weights in experiment_dir
            # if a path, loads these weights
            "init_with_weights": False,
        },
        "training": {
            "learning_rate": 0.001,
            "epochs": 5,
            "batch_size": 128,
            "loss": {"decoder": LossEnum.SIGMA_MSE, "latent": LossEnum.KL},
            "metrics": {"decoder": LossEnum.MSE_metric, "latent": LossEnum.KL},
            # saving models
            "save_model_weights": True,
            "save_history": True,
            "overwrite_history": True,
        },
        "evaluation": {
            "split": "val",
            "predict_reps": ["latent", "decoder"],
            "img_ids": 1,
            "predict_imgs": True,
            "predict_cluster_imgs": True,
        },
        "cluster": {  # cluster config, also used in this format for whole data clustering
            "cluster_name": "clustering",
            "cluster_rep": "latent",
            "cluster_method": "leiden",  # leiden or kmeans
            "leiden_resolution": 0.2,
            "subsample": True,  # 'subsample' or 'som'
            "subsample_kwargs": {'frac': 0.1},
            "som_kwargs": {},
            "umap": True,
        },
    }

    exp = Experiment(experiment_config)
    run_experiments([exp], mode='trainval')

def test_model_training():
    model_name = id_generator(size=6)
    #model_name = "SASY21"
    train_model(model_name)

    # check if all expected files are created
    exp = Experiment.from_dir("test_experiment/"+model_name)
    exp.set_to_evaluate()
    _ = Estimator(exp)
    

test_model_training()



In [42]:
from campa.tl import Cluster
def cluster(model_name):
    prepare_test_experiment(model_name, cluster_subset=False, full_data_prediction=False, full_data_clustering=False)
    create_cluster_data("test_experiment/"+model_name, subsample=True, frac=0.1, save_dir="aggregated/sub-0.1", cluster=True)

def test_cluster_subset():
    model_name = id_generator(size=6)
    #model_name = 'VZRH8X'
    cluster(model_name)

    test_cl = Cluster.from_cluster_data_dir("test_experiment/"+model_name+"/aggregated/sub-0.1")
    reference_cl = Cluster.from_cluster_data_dir("reference_experiment/cVAE/aggregated/sub-0.1")

    comp = test_cl.cluster_mpp.compare(reference_cl.cluster_mpp)[1]
    assert comp['x']
    assert comp['y']
    assert comp['obj_ids']
    assert comp['mpp']

    import numpy as np
    print(test_cl.cluster_mpp.data('latent'))
    print(reference_cl.cluster_mpp.data('latent'))
    # TODO why is latent not similar?
    print(np.isclose(test_cl.cluster_mpp.data('latent'), reference_cl.cluster_mpp.data('latent')))

    assert test_cl.cluster_annotation is not None
    test_cl.cluster_mpp.get_adata(X='mpp', obs=['clustering'], obsm={"X_latent": "latent", "X_umap": "umap"})

test_cluster_subset()

Could not load MPPData from test_experiment/8SXVWJ/aggregated/sub-0.1
Could not load MPPData from test_experiment/8SXVWJ/aggregated/sub-0.1
Saving partial keys of mpp data without a base_data_dir to enable correct loading




Saving partial keys of mpp data without a base_data_dir to enable correct loading


Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/test_experiment/8SXVWJ/aggregated/sub-0.1/clustering.npy
Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/reference_experiment/cVAE/aggregated/sub-0.1/clustering.npy
[[ 0.7025145   1.0100106   0.3618012  -0.54160935]
 [ 3.0500026   0.01753122  0.05046673 -1.4755267 ]
 [-0.24107969 -1.0978781  -0.4569826   0.30672166]
 ...
 [ 0.2739511  -0.49121684  0.92969525 -1.496013  ]
 [ 1.2105104  -0.59349877 -1.1900173   0.52722204]
 [ 0.3490503  -1.6331238   2.2730925  -0.9045105 ]]
[[ 0.5271177   0.1882852  -1.0454091  -0.5767247 ]
 [ 1.8563808  -1.482907   -1.2025827   0.43522835]
 [ 0.89105386  0.32178292 -0.39630467  1.3699696 ]
 ...
 [ 0.5740625  -0.50714755 -0.652853    0.30780938]
 [ 0.46897215  0.5794399  -0.2598577   0.9203548 ]
 [ 0.12242371 -0.5240741  -0.42662796 -0.7542753 ]]
[[False False False False]
 [False False False Fals



In [40]:
from campa.data import MPPData
import numpy as np

def test_predict_full_data():
    model_name = id_generator(size=6)
    prepare_test_experiment(model_name, cluster_subset=True, full_data_prediction=False, full_data_clustering=False)

    # predict full data
    prepare_full_dataset("test_experiment/"+model_name, save_dir="aggregated/full_data")

    # check results
    for data_dir in ["184A1_unperturbed/I09", "184A1_meayamycin/I12"]:
        # load mpp_data with cluster_rep
        test_mpp_data = MPPData.from_data_dir(
            data_dir,
            base_dir=os.path.join(EXPERIMENT_DIR, "test_experiment", model_name, "aggregated/full_data"),
            keys=["x", "y", "obj_ids", "latent"],
            data_config="TestData",
        )
        reference_mpp_data = MPPData.from_data_dir(
            data_dir,
            base_dir=os.path.join(EXPERIMENT_DIR, "test_experiment", model_name, "aggregated/full_data"),
            keys=["x", "y", "obj_ids", "latent"],
            data_config="TestData",
        )

        comp = test_mpp_data.compare(reference_mpp_data)[1]
        assert comp['x']
        assert comp['y']
        assert comp['obj_ids']
        assert comp['mpp']
        assert (np.isclose(test_mpp_data.data('latent'), reference_mpp_data.data('latent')).all())
        #print(test_mpp_data.data('latent'))
        #print(reference_mpp_data.data('latent'))

test_predict_full_data()



In [41]:
def test_cluster_full_data():
    model_name = id_generator(size=6)
    prepare_test_experiment(model_name, cluster_subset=True, full_data_prediction=True, full_data_clustering=False)

    # predict full data
    project_cluster_data(
        "test_experiment/"+model_name,
        cluster_data_dir="aggregated/sub-0.1",
        cluster_name="clustering",
        save_dir="aggregated/full_data",
    )

    # check results
    for data_dir in ["184A1_unperturbed/I09", "184A1_meayamycin/I12"]:
        # load mpp_data with cluster_rep
        test_mpp_data = MPPData.from_data_dir(
            data_dir,
            base_dir=os.path.join(EXPERIMENT_DIR, "test_experiment", model_name, "aggregated/full_data"),
            keys=["x", "y", "obj_ids", "clustering"],
            data_config="TestData",
        )
        reference_mpp_data = MPPData.from_data_dir(
            data_dir,
            base_dir=os.path.join(EXPERIMENT_DIR, "test_experiment", model_name, "aggregated/full_data"),
            keys=["x", "y", "obj_ids", "clustering"],
            data_config="TestData",
        )

        assert test_mpp_data.compare(reference_mpp_data)[0]
        

test_cluster_full_data()

Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/reference_experiment/cVAE/aggregated/sub-0.1/clustering.npy


Saving partial keys of mpp data without a base_data_dir to enable correct loading
Saving partial keys of mpp data without a base_data_dir to enable correct loading


Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/test_experiment/C3PW86/aggregated/full_data/184A1_unperturbed/I09/clustering.npy
Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/test_experiment/C3PW86/aggregated/full_data/184A1_unperturbed/I09/clustering.npy
{'x': True, 'y': True, 'obj_ids': True, 'labels': True, 'mpp': True, 'clustering': True, 'latent': True, 'channels': True, 'metadata': True}
Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/test_experiment/C3PW86/aggregated/full_data/184A1_meayamycin/I12/clustering.npy
Cannot read with memmap:  /Users/hannah.spitzer/projects/pelkmans/software_new/campa/tests/_experiments/test_experiment/C3PW86/aggregated/full_data/184A1_meayamycin/I12/clustering.npy
{'x': True, 'y': True, 'obj_ids': True, 'labels': True, 'mpp': True, 'clustering': True, 'latent': True, 'channels': True

In [5]:
def test_extract_features():
    pass
    #TODO test features

In [9]:
test_ds.data['train'].compare(reference_ds.data['train'])

(True,
 {'x': True,
  'y': True,
  'obj_ids': True,
  'mpp': True,
  'labels': True,
  'conditions': True,
  'channels': True,
  'metadata': True})

In [6]:
mpp_data.compare?

[0;31mSignature:[0m [0mmpp_data[0m[0;34m.[0m[0mcompare[0m[0;34m([0m[0mobj[0m[0;34m:[0m [0;34m'MPPData'[0m[0;34m)[0m [0;34m->[0m [0mTuple[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mbool[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/projects/pelkmans/software_new/campa/campa/data/_data.py
[0;31mType:[0m      method
