In [1]:
# This notebook trains models on specific tissues and platforms from the TissueNet dataset
import os
import errno
import numpy as np 
import deepcell
from deepcell_toolbox.multiplex_utils import multiplex_preprocess

In [2]:
# Create the folder for this set of experiments.
# MODEL_DIR receives the trained model files and loss histories, NPZ_DIR holds
# the input .npz datasets, and LOG_DIR is handed to the training callbacks as
# the TensorBoard log directory.
experiment_folder = "specialist_benchmarking/"
MODEL_DIR = os.path.join("/data/analyses", experiment_folder)
NPZ_DIR = '/data/npz_data/20201018_freeze/'
LOG_DIR = '/data/logs'

# exist_ok=True keeps this cell safe to re-run and avoids the check-then-create
# race of a separate os.path.isdir() test.
os.makedirs(MODEL_DIR, exist_ok=True)

In [3]:
from deepcell.utils.data_utils import get_data
from skimage.segmentation import relabel_sequential

# Name of the dataset split to train on; also used as the prefix of the saved
# model names further down.
npz_name = "20201018_multiplex_seed_3"

# Build the paths with os.path.join so they stay valid whether or not NPZ_DIR
# ends with a path separator.
train_npz_path = os.path.join(NPZ_DIR, npz_name + "_train_512x512.npz")
val_npz_path = os.path.join(NPZ_DIR, npz_name + "_val_256x256.npz")

# Arrays read from each archive.
npz_keys = ('X', 'y', 'tissue_list', 'platform_list')

# np.load on an .npz returns a lazily-read NpzFile that keeps the underlying
# file open. Read the needed arrays inside a `with` block so the handle is
# closed, and keep them in plain dicts so train_dict / val_dict stay usable.
with np.load(train_npz_path) as train_npz:
    train_dict = {key: train_npz[key] for key in npz_keys}

with np.load(val_npz_path) as val_npz:
    val_dict = {key: val_npz[key] for key in npz_keys}

# Images / labels plus the per-image tissue and platform annotations that are
# used later to pick the subset each specialist model is trained on.
X_train, y_train = train_dict['X'], train_dict['y']
train_tissue, train_platform = train_dict['tissue_list'], train_dict['platform_list']
X_val, y_val = val_dict['X'], val_dict['y']
val_tissue, val_platform = val_dict['tissue_list'], val_dict['platform_list']

In [4]:
# Run multiplex_preprocess over the image arrays of both splits.
# NOTE: each line overwrites its own input, so re-running this cell without
# first re-running the data-loading cell applies the preprocessing a second time.
X_train = multiplex_preprocess(X_train)
X_val = multiplex_preprocess(X_val)

In [5]:
# Toggle: True trains one model per tissue type, False one model per imaging platform.
tissue_subset = False
tissues = ['breast', 'gi', 'immune', 'pancreas', 'all']
platforms = ['codex', 'cycif', 'mibi', 'vectra']

# Select, in one step, the per-image annotation arrays used to slice the train
# and val data together with the list of subset names to iterate over.
train_subset_list, val_subset_list, subsets = (
    (train_tissue, val_tissue, tissues)
    if tissue_subset
    else (train_platform, val_platform, platforms)
)

In [None]:
from deepcell.model_zoo.panopticnet import PanopticNet
from tensorflow.keras.optimizers import SGD, Adam
from deepcell.utils.train_utils import rate_scheduler
from deepcell import image_generators
from deepcell.utils import train_utils
from tensorflow.python.keras.losses import MSE
from deepcell import losses
from deepcell.utils.train_utils import get_callbacks
from deepcell.utils.train_utils import count_gpus


from tensorflow.keras import backend as K


def semantic_loss(n_classes):
    """Build the loss function for one semantic head.

    Args:
        n_classes: number of output channels of the head.

    Returns:
        A callable with the Keras loss signature ``(y_true, y_pred)``. Heads
        with more than one class use weighted categorical cross-entropy scaled
        by 0.01; single-channel heads use mean squared error.
    """
    # Keras calls losses as loss(y_true, y_pred); name the parameters
    # accordingly and forward them in that same order.
    def _semantic_loss(y_true, y_pred):
        if n_classes > 1:
            return 0.01 * losses.weighted_categorical_crossentropy(
                y_true, y_pred, n_classes=n_classes)
        return MSE(y_true, y_pred)
    return _semantic_loss


# Training parameters shared by every subset model.
n_epoch = 100  # Number of training epochs
batch_size = 8
min_objects = 0  # throw out images with fewer than this many objects
seed = 0

# Mask transforms (and their settings) applied by both data generators; the
# order matches the two semantic heads of the model.
transforms = ['inner-distance', 'pixelwise']
transforms_kwargs = {'pixelwise': {'dilation_radius': 1},
                     'inner-distance': {'erosion_width': 1, 'alpha': 'auto'}}

num_gpus = count_gpus()

for current_subset in subsets:
    print("Training model for {}".format(current_subset))

    # select the images belonging to the current subset ('all' keeps everything)
    if current_subset == 'all':
        X_train_subset, y_train_subset = X_train, y_train
        X_val_subset, y_val_subset = X_val, y_val
    else:
        subset_idx_train = np.isin(train_subset_list, current_subset)
        subset_idx_val = np.isin(val_subset_list, current_subset)

        # subset training dict for current subset
        X_train_subset, y_train_subset = X_train[subset_idx_train], y_train[subset_idx_train]
        X_val_subset, y_val_subset = X_val[subset_idx_val], y_val[subset_idx_val]

    print("There are {} images in the train subset and {} in the val subset".format(
        X_train_subset.shape[0], X_val_subset.shape[0]))

    # Nothing to train or validate on: report it and move on to the next
    # subset instead of failing inside the data generators.
    if X_train_subset.shape[0] == 0 or X_val_subset.shape[0] == 0:
        print("Skipping {}: train or val subset is empty".format(current_subset))
        continue

    # Several models are built in one process; drop the Keras/TF state left by
    # the previous model so memory use does not grow with every subset.
    K.clear_session()

    # initialize new model
    new_model = PanopticNet(
        backbone='resnet50',
        input_shape=(256, 256, 2),
        norm_method=None,
        num_semantic_heads=2,
        num_semantic_classes=[1, 3], # inner distance, pixelwise
        location=True,  # should always be true
        include_top=True)

    # set up training parameters
    model_name = npz_name + '_subset_100_' + current_subset

    optimizer = Adam(lr=1e-4, clipnorm=0.001)
    lr_sched = rate_scheduler(lr=1e-4, decay=0.99)

    print("Model name is {}".format(model_name))

    # create augmented dataset; the crop size matches the model's input_shape
    datagen = image_generators.CroppingDataGenerator(
        rotation_range=180,
        shear_range=0,
        zoom_range=(0.75, 1.25),
        horizontal_flip=True,
        vertical_flip=True,
        crop_size=(256, 256),
        float_dtype='float16',
        int_dtype='int16')

    # validation data is passed through without augmentation
    datagen_val = image_generators.SemanticDataGenerator(
        rotation_range=0,
        shear_range=0,
        zoom_range=0,
        horizontal_flip=False,
        vertical_flip=False,
        float_dtype='float16',
        int_dtype='int16')

    train_data = datagen.flow(
        {'X': X_train_subset, 'y': y_train_subset},
        seed=seed,
        transforms=transforms,
        transforms_kwargs=transforms_kwargs,
        min_objects=min_objects,
        batch_size=batch_size)

    val_data = datagen_val.flow(
        {'X': X_val_subset, 'y': y_val_subset},
        seed=seed,
        transforms=transforms,
        transforms_kwargs=transforms_kwargs,
        min_objects=min_objects,
        batch_size=batch_size)

    print('generators created')

    # Give losses for all of the semantic heads
    loss = {}
    for layer in new_model.layers:
        if layer.name.startswith('semantic_'):
            n_classes = layer.output_shape[-1]
            loss[layer.name] = semantic_loss(n_classes)

    # compile model
    new_model.compile(loss=loss, optimizer=optimizer)

    # train model
    model_path = os.path.join(MODEL_DIR, '{}.h5'.format(model_name))
    loss_path = os.path.join(MODEL_DIR, '{}.npz'.format(model_name))

    print('Training on', num_gpus, 'GPUs.')

    train_callbacks = get_callbacks(
        model_path,
        lr_sched=lr_sched,
        tensorboard_log_dir=LOG_DIR,
        save_weights_only=num_gpus >= 2,
        monitor='val_loss',
        verbose=1)

    # steps_per_epoch is a fixed count, independent of the subset size.
    # validation_steps is clamped to at least 1 so a val subset smaller than
    # one batch is still evaluated.
    loss_history = new_model.fit_generator(
        train_data,
        steps_per_epoch=333,
        epochs=n_epoch,
        validation_data=val_data,
        validation_steps=max(1, val_data.y.shape[0] // batch_size),
        callbacks=train_callbacks)

    # Persist the per-epoch metrics next to the model; otherwise each subset's
    # history is overwritten by the next iteration. The history dict is stored
    # as an object array, so reading it back needs np.load(..., allow_pickle=True).
    np.savez(loss_path, loss_history=loss_history.history)


Training model for codex


W1024 01:18:58.352425 139667778570048 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Downloading data from https://github.com/keras-team/keras-applications/releases/download/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
There are 858 images in the train subset and 1041 in the val subset


W1024 01:19:32.061852 139667778570048 semantic.py:111] X data dtype is float32: this will increase memory use during preprocessing. Consider using a smaller dtype


Model name is 20201018_multiplex_seed_3_subset_100_codex
