### Make Training Data (Movies/Images)

In [1]:
"""
make_training_data_tracking.py - for multiple sets of data with multiple parts and montages

Executing functions for creating npz files containing the training data
Functions will create training data for either
    - Patchwise sampling
    - Fully convolutional training of single image conv-nets
    - Fully convolutional training of movie conv-nets

Files should be placed in training directories with each separate
dataset getting its own folder

@author: David Van Valen
"""

"""
Import packages
"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
import os
import pathlib
import skimage as sk
import scipy as sp
from scipy import ndimage
from skimage import feature
from skimage import morphology as morph
from skimage.transform import resize
from sklearn.utils import class_weight
from deepcell import get_image
from deepcell import make_training_data
# from deepcell import format_coord as cf

# Load data: source movie directory and destination of the generated npz file
direc_name = '/data/data/cells/3T3/NIH/set2/part_1/movie'
output_directory = '/data/npz_data/cells/3T3/NIH/movie'
file_name_save = os.path.join( output_directory, 'nuclear_movie_3T3_S2P1_same.npz')
# Training directories are organized according to location within an image
num_x = 7 # Define num of horizontal samples
num_y = 7 # Define num of vertical samples
samples_to_drop = ['00_00','00_01','00_06','01_00','02_05','03_01','03_06','04_00','05_00','05_06','06_02'] # Some movies/montages/samples do not contain cells or contain annotation errors
#samples_to_drop = []
# Build list of possible training directories (excluding those to be dropped)
# The '0{}_0{}' pattern hard-codes one leading zero, so names stay two digits wide
# only while num_x and num_y are at most 10.
training_direcs = ['0{}_0{}'.format(i,j) for i in range(num_x) for j in range(num_y)]
training_direcs = [x for x in training_direcs if x not in samples_to_drop]
channel_names = [""] # Commonality in raw filenames

# Create output directory, if necessary
pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)

# Create the training data
# NOTE(review): make_training_data is imported from deepcell and is not defined in this
# notebook; presumably it writes the npz to file_name_save -- confirm against deepcell.
make_training_data(
    direc_name = direc_name,
    file_name_save = file_name_save,
    channel_names = channel_names,
    dimensionality = 3,
    training_direcs = training_direcs,
    raw_image_direc = "raw",
    annotation_direc = "annotated",
    annotation_name = "",
    border_mode = "same",
    output_mode = "conv",
    num_frames = 30,
    reshape_size = None,
    verbose = True)


Using TensorFlow backend.


In [31]:
"""
make_training_data_tracking.py - for a single directory of data

Import packages
"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
import os
import pathlib
import skimage as sk
import scipy as sp
from scipy import ndimage
from skimage import feature
from skimage import morphology as morph
from skimage.transform import resize
from sklearn.utils import class_weight
from deepcell import get_image
from deepcell import make_training_data

# Load data: here the npz is written into the same directory tree that holds the source data
#direc_name = '/data/data/cells/HEK293/generic/set0/deepcell_segmentations_v1'
#output_directory = '/data/data/cells/HEK293/generic/set0/deepcell_segmentations_v1/'
direc_name = '/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/'
output_directory = '/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/'


file_name_save = os.path.join(output_directory, 'nuclear_movie_3T3_s0p2_DCWS.npz')
# A single training directory is used here (no montage grid, nothing to drop)
training_direcs = ['part_2']
channel_names = [""] # Commonality in raw filenames

# Create output directory, if necessary
pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)

# Create the training data
# Unlike the multi-montage cell, border_mode is not passed here, so whatever default
# make_training_data defines applies. NOTE(review): confirm that default is the intended one.
make_training_data(
    direc_name = direc_name,
    file_name_save = file_name_save,
    channel_names = channel_names,
    dimensionality = 3,
    training_direcs = training_direcs,
    raw_image_direc = "raw",
    annotation_direc = "annotated",
    annotation_name = "",
    output_mode = "conv",
    num_frames = 71,
    reshape_size = None,
    verbose = True)


Using TensorFlow backend.


In [9]:
# to compile multiple sets together

import numpy as np
import os

# Common path prefix of the per-set npz files; the set name and '_same.npz' are appended below
base_path = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_'
#num_of_sets = 2
set_list = ['S0P1','S1P1','S1P2','S2P1','S2P2']

# Instantiate arrays to hold the final training data and fill them
# The (30, 154, 182, 1) per-movie shape is hard-coded and must match every loaded set.
# np.empty defaults to float64, so the concatenated result is promoted to at least float64.
X_full = np.empty((0, 30, 154, 182, 1))
y_full = np.empty((0, 30, 154, 182, 1))
#for movie in range(num_of_sets):
for movie in set_list:
    # os.path.join receives a single, already concatenated string here
    path = os.path.join(base_path + str(movie) + '_same.npz')
    data = np.load(path)
    print(data.keys())
    X_to_load, y_to_load = data['X'], data['y']
    print('X Shape:', X_to_load.shape)
    print('y Shape:', y_to_load.shape)
    # Append along the batch axis; every iteration copies the accumulated arrays
    X_full = np.concatenate((X_full, X_to_load), axis=0)
    y_full = np.concatenate((y_full, y_to_load), axis=0)

# Save the result to a new npz
output_directory = '/data/npz_data/cells/3T3/NIH/movie/'
file_name_save = os.path.join( output_directory, 'nuclear_movie_3T3_allbutS0P2_same.npz')

np.savez(file_name_save, X=X_full, y=y_full)


['y', 'X']
X Shape: (44, 30, 154, 182, 1)
y Shape: (44, 30, 154, 182, 1)
['y', 'X']
X Shape: (44, 30, 154, 182, 1)
y Shape: (44, 30, 154, 182, 1)
['y', 'X']
X Shape: (36, 30, 154, 182, 1)
y Shape: (36, 30, 154, 182, 1)
['X', 'y']
X Shape: (38, 30, 154, 182, 1)
y Shape: (38, 30, 154, 182, 1)
['y', 'X']
X Shape: (45, 30, 154, 182, 1)
y Shape: (45, 30, 154, 182, 1)


In [53]:
# Verify the result: reload the combined 3T3 npz and print its keys and array shapes
data = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_allbutS0P2_same.npz')
# X_to_load / y_to_load are assigned here but not used again in this cell
X_to_load, y_to_load = data['X'][()], data['y'][()]

print(data.keys())
data_readable_X, data_readable_y = data['X'][()], data['y'][()]
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['X', 'y']
X Shape: (207, 30, 154, 182, 1)
y Shape: (207, 30, 154, 182, 1)


In [5]:
# Verify the result: load the HeLa npz and print its keys and array shapes
data = np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same.npz')
# X_to_load / y_to_load are assigned here but not used again in this cell
X_to_load, y_to_load = data['X'][()], data['y'][()]

print(data.keys())
data_readable_X, data_readable_y = data['X'][()], data['y'][()]
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (180, 40, 216, 256, 1)
y Shape: (180, 40, 216, 256, 1)


In [55]:
# Verify the result: load the set 0 / part 2 npz and print its keys and array shapes
data = np.load('/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/part_2/nuclear_movie_3T3_S0P2_same.npz')
# X_to_load / y_to_load are assigned here but not used again in this cell
X_to_load, y_to_load = data['X'][()], data['y'][()]

print(data.keys())
data_readable_X, data_readable_y = data['X'][()], data['y'][()]
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (49, 30, 154, 182, 1)
y Shape: (49, 30, 154, 182, 1)


In [1]:
# Test Combining Mulitiple Different Cell Types/Acquisition Parameters (Test on HeLa + 3T3)
# These types have different sizes (num of frames and pixel dimensions) to compensate we will zero pad

import os
import numpy as np

base_path = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_'
list_to_comb = ['3T3_allbutS0P2','hela0-7']

#base_path = '/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/part_2/nuclear_movie_'
#list_to_comb = ['3T3_S0P2']

# Define a class to hold both raw data and labels for any dataset 
class Dataset_Xy():
    """Container pairing the raw data (X_to_load) with its labels (y_to_load) for one dataset."""

    def __init__(self, X_to_load, y_to_load):
        # Both values are stored as given; the combining loop later reassigns these
        # attributes with padded copies.
        self.X_to_load = X_to_load
        self.y_to_load = y_to_load

# Define a normalizaiton function for the raw images that can be run before padding
def image_norm(original_image):
    """Normalize an image to zero mean and unit variance.

    Args:
        original_image: numeric array of pixel values.

    Returns:
        Array of the same shape holding (image - mean) / std. A constant image
        (std == 0) is returned as all zeros instead of NaN from 0 / 0.
    """
    # NNs prefer input data that is 0 mean and unit variance
    centered = original_image - np.mean(original_image)
    spread = np.std(original_image)
    if spread == 0:
        # Every pixel equals the mean, so `centered` is already all zeros
        return centered
    return centered / spread

# Load each movie and record the largest frame count / image dimensions for padding
max_x = 0
max_y = 0
max_frames = 0
datasets = {}
for dataset in list_to_comb:
    path = os.path.join(base_path + str(dataset) + '_same.npz')
    #path = os.path.join(base_path + str(dataset) + '.npz')
    data = np.load(path)
    d = Dataset_Xy(data['X'], data['y'])
    datasets[dataset] = d
    # Axis 1 is used as the frame axis, axes 2 and 3 as the two image axes
    max_frames = max(max_frames, d.X_to_load.shape[1])
    max_x = max(max_x, d.X_to_load.shape[2])
    max_y = max(max_y, d.X_to_load.shape[3])

# Instantiate (empty) arrays to hold the final training data
X_full = np.zeros((0, max_frames, max_x, max_y, 1))
y_full = np.zeros((0, max_frames, max_x, max_y, 1))

print(X_full.shape)
print(y_full.shape)

# Normalize images and zero-pad smaller movies (3T3 in this case) up to the common size, then combine
for dataset in list_to_comb:
    dtl = datasets[dataset]
    # Normalize the raw images one frame at a time
    # NOTE(review): the normalized frame is written back into X_to_load in place, so an
    # integer-typed X would truncate the result -- confirm the npz stores floats.
    for batch in range(dtl.X_to_load.shape[0]):
        for frame in range(dtl.X_to_load.shape[1]):
            dtl.X_to_load[batch, frame, :, :, 0] = image_norm(dtl.X_to_load[batch, frame, :, :, 0])
    # Image padding: image axes are padded on both sides, frames are appended at the end.
    # An odd size difference puts the extra row/column on the trailing side so the padded
    # array always reaches the target size (int(diff / 2) on both sides left it one short).
    frame_deficit = max_frames - dtl.X_to_load.shape[1]
    x_deficit = max_x - dtl.X_to_load.shape[2]
    y_deficit = max_y - dtl.X_to_load.shape[3]
    if frame_deficit or x_deficit or y_deficit:
        pad_spec = ((0, 0),
                    (0, frame_deficit),
                    (x_deficit // 2, x_deficit - x_deficit // 2),
                    (y_deficit // 2, y_deficit - y_deficit // 2),
                    (0, 0))
        # X and y receive the same padding so labels stay aligned with the images
        dtl.X_to_load = np.pad(dtl.X_to_load, pad_spec, mode='constant', constant_values=0)
        dtl.y_to_load = np.pad(dtl.y_to_load, pad_spec, mode='constant', constant_values=0)
    # Add to final training data
    X_full = np.concatenate((X_full, dtl.X_to_load), axis=0)
    y_full = np.concatenate((y_full, dtl.y_to_load), axis=0)

print(X_full.shape)
print(y_full.shape)

# Save the result to a new npz

output_directory = '/data/npz_data/cells/3T3/NIH/movie/'
file_name_save = os.path.join( output_directory, 'nuclear_movie_3T3_and_HeLa_V3.npz')

#output_directory = '/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/part_2/'
#file_name_save = os.path.join( output_directory, 'nuclear_movie_3T3_S0P2_same.npz')

np.savez(file_name_save, X=X_full, y=y_full)

(0, 40, 216, 256, 1)
(0, 40, 216, 256, 1)
(387, 40, 216, 256, 1)
(387, 40, 216, 256, 1)


In [38]:
# Review Data if neccesary
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

def get_js_video(images, batch=0, channel=0):
    """Render one movie of a batch as an interactive JavaScript animation.

    Args:
        images: array indexed as images[batch, frame, :, :, channel].
        batch: index of the movie to show.
        channel: index of the channel to show.

    Returns:
        IPython.display.HTML wrapping the animation's to_jshtml() output.
    """
    fig = plt.figure()
    ims = []
    for i in range(images.shape[1]):
        # Fixed color limits (0-15) keep the color scale constant across frames
        im = plt.imshow(images[batch, i, :, :, channel], animated=True, cmap='cubehelix', vmin=0, vmax=15)
        ims.append([im])
    # Build the animation once, after every frame has been collected, instead of
    # constructing a new ArtistAnimation on each loop iteration.
    ani = animation.ArtistAnimation(fig, ims, interval=75, repeat_delay=1000)
    plt.close()
    return HTML(ani.to_jshtml())

get_js_video(X_full, batch=0)

In [7]:
# Check the result: reload the npz that was just written and print its keys and shapes
# file_name_save is not defined in this cell; it must still be set from an earlier cell.
data = np.load(file_name_save)
print(data.keys())
data_readable_X, data_readable_y = data['X'][()], data['y'][()]
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (349, 40, 216, 256, 1)
y Shape: (349, 40, 216, 256, 1)


### Make Training Data (Division Information)

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import scipy as sp

csv_path = '/data/npz_data/cells/3T3/NIH/movie/divisions-3T3.csv'
#csv_path = '/data/npz_data/cells/HeLa/S3/movie/divisions-HeLa.csv'

#Open .csv file containing hand-curated cell division data
divisions_csv = pd.read_csv(csv_path)

#Replace NaN entries with 0 (not with blanks); later cells test `daughter != 0` to find divisions
divisions_csv = divisions_csv.replace(np.nan, 0, regex=True)

In [3]:
#Throw away all montages we decided not to use for training

# Boolean mask of rows whose 'dont use (true)' column compares equal to True
throw_away_indices = divisions_csv.loc[:, r'dont use (true)'] == True

keep_indices = ~throw_away_indices

# Overwrites divisions_csv in place, so re-running this cell filters the already filtered frame
divisions_csv = divisions_csv.loc[keep_indices,:]

# head(6201) displays up to 6201 rows of the filtered frame
divisions_csv.head(6201)

Unnamed: 0,number,cell_type,set,part,montage,label,daughter,frame_div (0-index),no cells (true),dont use (true),Notes
0,1,3T3,0,1,00_00,1,0,0.0,0.0,0,0
1,1,3T3,0,1,00_00,2,0,0.0,0.0,0,0
2,1,3T3,0,1,00_00,3,0,0.0,0.0,0,0
3,1,3T3,0,1,00_00,4,56,8.0,0.0,0,0
4,1,3T3,0,1,00_00,5,0,0.0,0.0,0,0
5,1,3T3,0,1,00_00,6,0,0.0,0.0,0,0
6,1,3T3,0,1,00_00,7,89,17.0,0.0,0,0
7,1,3T3,0,1,00_00,8,0,0.0,0.0,0,0
8,1,3T3,0,1,00_00,9,0,0.0,0.0,0,0
9,1,3T3,0,1,00_00,10,0,0.0,0.0,0,0


In [4]:
# Total number of divisions
# astype(bool) maps the 0 placeholder to False and any other daughter entry to True,
# so the sum counts the rows that list daughters.
num_divisions = divisions_csv['daughter'].astype(bool).sum(axis=0)
print(num_divisions)

177


In [36]:
# unique_montages contains a list of each montage grouped by set - index by unique_montages[set][montage = 00_0, 00_1]

# Use for data with no "parts"

def division_per_montage(set_num, montage):
    """Collect the daughter labels of every dividing cell in one montage.

    Reads the module-level `divisions_csv` frame.

    Args:
        set_num: value matched against the `set` column.
        montage: value matched against the `montage` column.

    Returns:
        List of 31 numpy arrays; the entry at a parent's label holds that parent's
        daughter labels, all other entries are empty arrays.
    """
    # One (initially empty) slot for each parent label 0..30
    slots = [np.array([]) for _ in range(31)]
    for record in divisions_csv.itertuples():
        in_montage = record.set == set_num and record.montage == montage
        if not in_montage or record.daughter == 0:
            continue
        # Daughters are stored as a comma separated string, e.g. "5,6"
        kid_labels = [int(token) for token in record.daughter.split(',')]
        slots[int(record.label)] = np.array(kid_labels)
    return slots

# Montage names grouped by set: unique_montages[set] is the array of montages in that set
unique_montages = divisions_csv['montage'].groupby(divisions_csv['set']).unique()

children = []
for set_num in divisions_csv['set'].unique():
    for montage in unique_montages[set_num]:
        arr_to_append = division_per_montage(set_num, montage)
        children.append(arr_to_append)

# Build the (num_montages, num_slots) object array explicitly. The per-slot arrays have
# different lengths, and np.array() on such a ragged nested list is rejected by current
# NumPy unless dtype=object is requested.
num_slots = len(children[0]) if children else 0
daughters_arr = np.empty((len(children), num_slots), dtype=object)
for batch, slots in enumerate(children):
    for i, lst in enumerate(slots):
        daughters_arr[batch, i] = np.asarray(lst, dtype='int32')

children = daughters_arr
np.savez('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_S0P1andS2P2_same_kids.npz', daughters=children)

In [5]:
# Use for data with multiple "parts"

# Updated def of division_per_montage to accommodate multiple parts
def division_per_montage(set_num, part_num, montage):
    """Collect the daughter labels of every dividing cell in one montage of one part.

    Reads the module-level `divisions_csv` frame.

    Args:
        set_num: value matched against the `set` column.
        part_num: value matched against the `part` column.
        montage: value matched against the `montage` column.

    Returns:
        List of 31 numpy arrays; the entry at a parent's label holds that parent's
        daughter labels, all other entries are empty arrays.
    """
    # One (initially empty) slot for each parent label 0..30
    slots = [np.array([]) for _ in range(31)]
    for record in divisions_csv.itertuples():
        in_montage = (record.set == set_num
                      and record.part == part_num
                      and record.montage == montage)
        if not in_montage or record.daughter == 0:
            continue
        # Daughters are stored as a comma separated string, e.g. "5,6"
        kid_labels = [int(token) for token in record.daughter.split(',')]
        slots[int(record.label)] = np.array(kid_labels)
    return slots


# unique_montages holds the montage names grouped by (set, part):
# unique_montages[set, part] is the array of montages in that part
unique_montages = divisions_csv['montage'].groupby([divisions_csv['set'],divisions_csv['part']]).unique()

children = []
for set_num in divisions_csv['set'].unique():
    for part_num in divisions_csv['part'].unique():
        if set_num == 0 and part_num == 2:
            print("skipping set 0 part 2")
            continue
        if (set_num, part_num) not in unique_montages.index:
            # Not every set has to contain every part number; skip absent combinations
            # instead of failing with a KeyError
            continue
        for montage in unique_montages[set_num, part_num]:
            arr_to_append = division_per_montage(set_num, part_num, montage)
            children.append(arr_to_append)

# Build the (num_montages, num_slots) object array explicitly. The per-slot arrays have
# different lengths, and np.array() on such a ragged nested list is rejected by current
# NumPy unless dtype=object is requested.
num_slots = len(children[0]) if children else 0
daughters_arr = np.empty((len(children), num_slots), dtype=object)
for batch, slots in enumerate(children):
    for i, lst in enumerate(slots):
        daughters_arr[batch, i] = np.asarray(lst, dtype='int32')

children = daughters_arr
np.savez('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_allbutS0P2_same_kids.npz', daughters=children)



skipping set 0 part 2


In [8]:
print(unique_montages[2, 2].shape)

(45,)


### Let's check against the original file for formatting

In [30]:
# Look at the original kids npz to see if it looks correct
# NOTE(review): the 'daughters' entry is an array of arrays; current NumPy needs
# np.load(..., allow_pickle=True) before such an entry can be read -- confirm for your version.
data = np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same_kids.npz')
data.keys()

['daughters']

In [31]:
# Store data from keys to deconsruct
data_readable = data['daughters']
data_readable.shape

(180, 31)

In [32]:
# Check the first two entries for structure
data_readable[0:2,:]

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([8, 9], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        arr

In [73]:
# Look at the 3T3 kids npz to see if it matches the structure of the original HeLa file
# NOTE(review): the 'daughters' entry is an array of arrays; current NumPy needs
# np.load(..., allow_pickle=True) before such an entry can be read -- confirm for your version.
data = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_allbutS0P2-S2P1_same_kids.npz')
data.keys()

['daughters']

In [74]:
# Store data from keys to deconsruct
data_readable = data['daughters']
data_readable.shape

(169, 31)

In [75]:
# Check the first two entries for structure
data_readable[0:2,:]

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([5, 6], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([8, 9], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
       

In [2]:
# Compile the two separate daughter lists
# The 'daughters' entries are object arrays (one small array per slot). np.load only
# unpickles those when allow_pickle=True is passed; use it on trusted files only, since
# unpickling can execute arbitrary code.
_3T3_daughter = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_allbutS0P2_same_kids.npz', allow_pickle=True)
_3T3_daughter_readable = _3T3_daughter['daughters']
print(_3T3_daughter_readable.shape)

hela_daughter = np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same_kids.npz', allow_pickle=True)
hela_daughter_readable = hela_daughter['daughters']
print(hela_daughter_readable.shape)

# Stack along the batch axis, 3T3 first and HeLa second (the same order list_to_comb
# uses when the image npz is combined)
all_daughters = np.concatenate((_3T3_daughter_readable, hela_daughter_readable), axis=0)
print(all_daughters.shape)

np.savez('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_and_HeLa_V3_kids.npz', daughters=all_daughters)


(207, 31)
(180, 31)
(387, 31)


In [77]:
import numpy as np

# Look at the combined (V2) kids npz to see if it looks correct
# NOTE(review): this inspects the V2 file, while the compile cell writes a V3 file.
# Reading the object-typed 'daughters' entry needs allow_pickle=True on current NumPy.
data = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_and_HeLa_V2_kids.npz')
print(data.keys())
data_readable = data['daughters']
print(data_readable.shape)


['daughters']
(349, 31)


## Combining TRK Files

In [1]:
from io import BytesIO

import json
import numpy as np
import os
import pathlib
import tarfile
import tempfile

def save_trks(filename, lineages, raw, tracked):
    """Write a multi-movie `.trks` tar archive.

    The archive holds three members: `lineages.json`, `raw.npy` and `tracked.npy`.

    Args:
        filename: output path; must end with '.trks'.
        lineages: JSON-serializable lineage data.
        raw: array-like saved with np.save as `raw.npy`.
        tracked: array-like saved with np.save as `tracked.npy`.

    Raises:
        ValueError: if filename does not end with '.trks'.
    """
    if not filename.endswith(".trks"):
        raise ValueError("filename must end with '.trks'")

    with tarfile.open(filename, "w") as archive:
        # Each member is staged in a temporary file and added under its final name
        with tempfile.NamedTemporaryFile("w") as json_tmp:
            json.dump(lineages, json_tmp, indent=1)
            json_tmp.flush()
            archive.add(json_tmp.name, "lineages.json")

        for member_name, array in (("raw.npy", raw), ("tracked.npy", tracked)):
            with tempfile.NamedTemporaryFile() as npy_tmp:
                np.save(npy_tmp, array)
                npy_tmp.flush()
                archive.add(npy_tmp.name, member_name)


def save_trk(filename, lineage, raw, tracked):
    """Write a single-movie `.trk` tar archive.

    The archive holds three members: `lineage.json`, `raw.npy` and `tracked.npy`.

    Args:
        filename: output path; must end with '.trk'.
        lineage: JSON-serializable lineage data.
        raw: array-like saved with np.save as `raw.npy`.
        tracked: array-like saved with np.save as `tracked.npy`.

    Raises:
        ValueError: if filename does not end with '.trk'.
    """
    if not filename.endswith(".trk"):
        raise ValueError("filename must end with '.trk'")

    with tarfile.open(filename, "w") as archive:
        # Each member is staged in a temporary file and added under its final name
        with tempfile.NamedTemporaryFile("w") as json_tmp:
            json.dump(lineage, json_tmp, indent=1)
            json_tmp.flush()
            archive.add(json_tmp.name, "lineage.json")

        for member_name, array in (("raw.npy", raw), ("tracked.npy", tracked)):
            with tempfile.NamedTemporaryFile() as npy_tmp:
                np.save(npy_tmp, array)
                npy_tmp.flush()
                archive.add(npy_tmp.name, member_name)
            
            
def load_trk(filename):
    """Read a single-movie `.trk` tar archive.

    Args:
        filename: path of a tar archive containing `lineage.json`, `raw.npy`
            and `tracked.npy`.

    Returns:
        Dict with keys "lineage" (top-level keys converted to int), "raw" and "tracked".
    """
    arrays = {}
    with tarfile.open(filename, "r") as archive:
        # extractfile yields bytes, so decode before handing the text to json
        lineage_member = archive.getmember("lineage.json")
        lineage_text = archive.extractfile(lineage_member).read().decode()
        lineage = json.loads(lineage_text)

        # The arrays are read from the archive into memory and parsed from there
        for key in ("raw", "tracked"):
            buffer = BytesIO(archive.extractfile(key + ".npy").read())
            arrays[key] = np.load(buffer)
            buffer.close()

    # JSON only allows strings as keys, so convert them back to ints here
    int_keyed_lineage = {int(label): fields for label, fields in lineage.items()}

    return {"lineage": int_keyed_lineage,
            "raw": arrays["raw"],
            "tracked": arrays["tracked"]}

def trk_folder_to_trks(dirname, trks_filename):
    """Bundle every `.trk` file of a folder into one `.trks` archive.

    Files are loaded in natural (numeric-aware) name order, so `batch_2.trk` comes
    before `batch_10.trk` and the batch order of the output is reproducible;
    os.listdir alone returns entries in arbitrary order. Entries that do not end
    with '.trk' are ignored.

    Args:
        dirname: folder containing the `.trk` files.
        trks_filename: output path, passed to save_trks (must end with '.trks').
    """
    import re  # local import: only needed for the natural-sort key

    def natural_key(name):
        # "batch_10.trk" -> ["batch_", 10, ".trk"], so digit runs compare numerically
        return [int(token) if token.isdigit() else token
                for token in re.split(r"(\d+)", name)]

    lineages = []
    raw = []
    tracked = []

    trk_names = [name for name in os.listdir(dirname) if name.endswith(".trk")]
    for filename in sorted(trk_names, key=natural_key):
        trk = load_trk(os.path.join(dirname, filename))
        lineages.append(trk["lineage"])
        raw.append(trk["raw"])
        tracked.append(trk["tracked"])

    save_trks(trks_filename, lineages, raw, tracked)
    
def npz_and_kids_to_trks(filename, filename_kids, outfilename):
    """Convert an image npz plus its "kids" npz into a `.trks` archive.

    Args:
        filename: npz file with entries "X" (raw movies) and "y" (tracked labels).
        filename_kids: npz file with entry "daughters", one row of per-label
            daughter arrays for each movie.
        outfilename: output path, passed to save_trks (must end with '.trks').
    """
    data = np.load(filename)
    # "daughters" is stored as an object array, which np.load only unpickles when
    # allow_pickle=True is passed. Use this on trusted files only: unpickling can
    # execute arbitrary code.
    kids = np.load(filename_kids, allow_pickle=True)

    raw = data["X"]
    tracked = data["y"]

    # convert kids["daughters"] to a list of dicts (label -> list of daughter labels)
    daughters = []
    for daughters_batch in kids["daughters"]:
        # Index 0 is skipped; generate_lineage likewise ignores label 0
        d = {i: list(map(int, lst))
             for i, lst in enumerate(daughters_batch)
             if i != 0}
        daughters.append(d)

    lineages = generate_lineages(tracked, daughters)

    save_trks(outfilename, lineages, raw, tracked)

def generate_lineages(tracked, daughters):
    """
    Build the list equivalent to `lineages.json` in .trks files: one
    generate_lineage() result per movie along the first axis of `tracked`.
    """
    lineages = []
    for batch in range(tracked.shape[0]):
        lineages.append(generate_lineage(tracked[batch], daughters[batch]))
    return lineages

def generate_lineage(tracked, daughters):
    """
    generates dictionary equivalent to `lineage.json` in .trk files.
    these WILL be missing `capped` and `frame_div`, since there is no way
    to always correctly infer this information.

    Args:
        tracked: label array whose first axis is the frame axis; label 0 is ignored.
        daughters: mapping (or sequence) from cell label to its daughter labels.
            A label without an entry is treated as having no daughters.

    Returns:
        Dict mapping each label found in `tracked` to a dict with keys
        "label", "frames", "parent" and "daughters".

    Raises:
        KeyError: if a listed daughter label never appears in `tracked`.
    """

    lineage = {}

    # fill in `label` & `frames`
    for frame in range(tracked.shape[0]):
        X = tracked[frame]
        for cell in map(int, np.unique(X)):
            if cell == 0:
                continue
            if cell not in lineage:
                try:
                    kids = daughters[cell]
                except (KeyError, IndexError):
                    # The kids files only carry a fixed number of label slots, so a
                    # higher label has no entry: treat it as a cell without daughters
                    kids = []
                lineage[cell] = {"label": cell,
                                 "frames": [frame],
                                 "parent": None,
                                 "daughters": list(map(int, kids))}
            else:
                lineage[cell]["frames"].append(frame)

    # fill in `parent` & `daughters`
    for cell, track in lineage.items():
        for c in track["daughters"]:
            lineage[c]["parent"] = cell

    return lineage

In [None]:
# Imports
import os

from deepcell.utils.data_utils import load_trks as load_trks
from deepcell.utils.misc_utils import sorted_nicely

# Define a normalizaiton function for the raw images that can be run before padding
def image_norm(original_image):
    """Normalize an image to zero mean and unit variance.

    Args:
        original_image: numeric array of pixel values.

    Returns:
        Array of the same shape holding (image - mean) / std. A constant image
        (std == 0) is returned as all zeros instead of NaN from 0 / 0.
    """
    # NNs prefer input data that is 0 mean and unit variance
    centered = original_image - np.mean(original_image)
    spread = np.std(original_image)
    if spread == 0:
        # Every pixel equals the mean, so `centered` is already all zeros
        return centered
    return centered / spread

# Define trk files to add in
s0p2_dir = '/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/part_2/curated_tracks'
new_trks = os.listdir(s0p2_dir)
new_trks_sorted = sorted_nicely(new_trks)

# Position (batch index) at which the new movies are inserted into the existing dataset
insert_at = 44

# Create trks file from existing npz + kids npz
npz_file = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_allbutS0P2_same'
npz_and_kids_to_trks(os.path.join(npz_file+'.npz'),os.path.join(npz_file+'_kids.npz'), os.path.join(npz_file+'.trks'))

# Assemble existing tracks in a folder to hold the entire dataset (sets 0-2)
dirname = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_0-2_same'  # Define folder save location
os.makedirs(dirname, exist_ok=True)  # save_trk cannot write into a missing folder

trks_file = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_allbutS0P2_same.trks'
trks = load_trks(trks_file)
# NOTE(review): this cell reads trks["X"] / trks["y"] while the later cells read
# trks["raw"] / trks["tracked"] -- confirm which keys the installed deepcell load_trks returns.
for i, (lineage, raw, tracked) in enumerate(zip(trks["lineages"], trks["X"], trks["y"])):
    # Existing movies at or after the insertion point are shifted to leave a gap
    if i < insert_at:
        movie_counter = i
    else:
        movie_counter = i + len(new_trks_sorted)

    save_trk(os.path.join(dirname, "batch_{}.trk".format(movie_counter)),
             lineage,
             raw,
             tracked)

# Store dimensions of raw and tracked to check new data against and pad if necessary.
# Taken explicitly from the last existing movie rather than from whatever `raw` the
# loop above happened to leave behind.
max_frames, max_y, max_x = trks["X"][-1].shape[:3]

# Insert tracks in other folder into the current dataset
for k, filename in enumerate(new_trks_sorted):
    movie_counter = insert_at + k
    trk = load_trk(os.path.join(s0p2_dir, filename))
    lineage = trk["lineage"]
    raw = trk["raw"]
    tracked = trk["tracked"]

    # Normalize the raw images
    # NOTE(review): written back in place, so an integer-typed raw array would truncate
    # the normalized values -- confirm the trk files store floats.
    for frame in range(raw.shape[0]):
        raw[frame, :, :, 0] = image_norm(raw[frame, :, :, 0])
    # Image padding - This assumes that raw and tracked have the same shape.
    # Image axes are padded on both sides, frames are appended at the end. An odd size
    # difference puts the extra row/column on the trailing side so the target size is
    # always reached (int(diff / 2) on both sides left the array one short).
    frame_deficit = max(max_frames - raw.shape[0], 0)
    y_deficit = max(max_y - raw.shape[1], 0)
    x_deficit = max(max_x - raw.shape[2], 0)
    if frame_deficit or y_deficit or x_deficit:
        pad_spec = ((0, frame_deficit),
                    (y_deficit // 2, y_deficit - y_deficit // 2),
                    (x_deficit // 2, x_deficit - x_deficit // 2),
                    (0, 0))
        raw = np.pad(raw, pad_spec, mode='constant', constant_values=0)
        tracked = np.pad(tracked, pad_spec, mode='constant', constant_values=0)

    save_trk(os.path.join(dirname, "batch_{}.trk".format(movie_counter)),
             lineage,
             raw,
             tracked)

# Transform folder into trks file
trk_folder_to_trks(dirname, "/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_0-2_same.trks")

In [78]:
# Imports
import os

from deepcell.utils.data_utils import load_trks as load_trks
from deepcell.utils.misc_utils import sorted_nicely

# Define a normalizaiton function for the raw images that can be run before padding
def image_norm(original_image):
    """Normalize an image to zero mean and unit variance.

    Args:
        original_image: numeric array of pixel values.

    Returns:
        Array of the same shape holding (image - mean) / std. A constant image
        (std == 0) is returned as all zeros instead of NaN from 0 / 0.
    """
    # NNs prefer input data that is 0 mean and unit variance
    centered = original_image - np.mean(original_image)
    spread = np.std(original_image)
    if spread == 0:
        # Every pixel equals the mean, so `centered` is already all zeros
        return centered
    return centered / spread

# Define trk files to add in
s0p2_dir = '/data/data/cells/3T3/NIH/set0/deepcell_segmentations_v1/part_2/curated_tracks'
new_trks = os.listdir(s0p2_dir)
new_trks_sorted = sorted_nicely(new_trks)

# Position (batch index) at which the new movies are inserted into the existing dataset
insert_at = 44

# Load trks file
trks_file = '/data/npz_data/cells/3T3/NIH/movie/HeLa_and_3T3allbutS0P2.trks'

# Assemble existing tracks using a folder to hold the entire dataset
dirname = '/data/npz_data/cells/3T3/NIH/movie/combiningtracks_new'    # Define folder save location
os.makedirs(dirname, exist_ok=True)  # save_trk cannot write into a missing folder

trks = load_trks(trks_file)
for i, (lineage, raw, tracked) in enumerate(zip(trks["lineages"], trks["raw"], trks["tracked"])):
    # Existing movies at or after the insertion point are shifted to leave a gap
    if i < insert_at:
        movie_counter = i
    else:
        movie_counter = i + len(new_trks_sorted)

    save_trk(os.path.join(dirname, "batch_{}.trk".format(movie_counter)),
             lineage,
             raw,
             tracked)

# Store dimensions of raw and tracked to check new data against and pad if necessary.
# Taken explicitly from the last existing movie rather than from whatever `raw` the
# loop above happened to leave behind.
max_frames, max_y, max_x = trks["raw"][-1].shape[:3]

# Insert tracks in other folder into the current dataset
for k, filename in enumerate(new_trks_sorted):
    movie_counter = insert_at + k
    trk = load_trk(os.path.join(s0p2_dir, filename))
    lineage = trk["lineage"]
    raw = trk["raw"]
    tracked = trk["tracked"]

    # Normalize the raw images
    # NOTE(review): written back in place, so an integer-typed raw array would truncate
    # the normalized values -- confirm the trk files store floats.
    for frame in range(raw.shape[0]):
        raw[frame, :, :, 0] = image_norm(raw[frame, :, :, 0])
    # Image padding - This assumes that raw and tracked have the same shape.
    # Image axes are padded on both sides, frames are appended at the end. An odd size
    # difference puts the extra row/column on the trailing side so the target size is
    # always reached (int(diff / 2) on both sides left the array one short).
    frame_deficit = max(max_frames - raw.shape[0], 0)
    y_deficit = max(max_y - raw.shape[1], 0)
    x_deficit = max(max_x - raw.shape[2], 0)
    if frame_deficit or y_deficit or x_deficit:
        pad_spec = ((0, frame_deficit),
                    (y_deficit // 2, y_deficit - y_deficit // 2),
                    (x_deficit // 2, x_deficit - x_deficit // 2),
                    (0, 0))
        raw = np.pad(raw, pad_spec, mode='constant', constant_values=0)
        tracked = np.pad(tracked, pad_spec, mode='constant', constant_values=0)

    save_trk(os.path.join(dirname, "batch_{}.trk".format(movie_counter)),
             lineage,
             raw,
             tracked)

# Transform folder into trks file
trk_folder_to_trks(dirname, "/data/npz_data/cells/3T3/NIH/movie/3T3_and_HeLa.trks")

## Read TRK Files to Provide Statistics

In [80]:
# Find the Number of cell tracks, the Number of frames per track, and the Number of divisions

#imports
import os
from deepcell.utils.data_utils import load_trks as load_trks

# load the trks file
direc_data = '/data/npz_data/cells/3T3/NIH/movie/'
dataset = 'nuclear_movie_3T3_and_HeLa_complete'
#dataset = 'HeLa_and_3T3allbutS0P2'

trks_file_name = os.path.join(direc_data, dataset + ".trks")

training_data = load_trks(trks_file_name)
X = training_data["raw"]
y = training_data["tracked"]

# `daughters` is of the form
#
#                   2 children / cell (potentially empty)
#                          ___________|__________
#                         /                      \
#      daughters = [{id_1: [daughter_1, daughter_2], ...}, ]
#                  \___________________________________/
#                                    |
#                       dict of (cell_id -> children)
#
# each batch has a separate (cell_id -> children) dict
daughters = [{cell: fields["daughters"]
             for cell, fields in tracks.items()}
            for tracks in training_data["lineages"]]

print("Image data shape: ", X.shape)
print("Number of lineages (should equal batch size): ", len(training_data["lineages"]))

Image data shape:  (416, 40, 216, 256, 1)
Number of lineages (should equal batch size):  416


In [81]:
import numpy as np

total_tracks = 0
total_divisions = 0
avg_frame_counts_in_batches = []
for batch, daughter_batch in enumerate(daughters):
    num_tracks_in_batch = len(daughter_batch)
    # A track counts as a division when its list of daughters is non-empty
    num_div_in_batch = sum(1 for kids in daughter_batch.values() if kids)
    total_tracks = total_tracks + num_tracks_in_batch
    total_divisions = total_divisions + num_div_in_batch

    # Count, for every label, the number of frames it appears in. np.unique runs once
    # per frame here instead of once per (cell, frame) pair.
    frames_per_label = {}
    for frame in y[batch]:
        for label in np.unique(frame).tolist():
            frames_per_label[label] = frames_per_label.get(label, 0) + 1

    frame_counts = [frames_per_label.get(cell_id, 0) for cell_id in daughter_batch.keys()]
    avg_frame_counts_in_batches.append(np.average(frame_counts))
# This is the mean of the per-batch means, so every batch carries the same weight
# regardless of how many tracks it contains.
avg_num_frames_per_track = np.average(avg_frame_counts_in_batches)

print("Dataset Statistics:")
print("Total number of unique tracks (cells) - ", total_tracks)
print("Total number of divisions             - ", total_divisions)
print("Average number of frames per track    - ", int(avg_num_frames_per_track))

Dataset Statistics:
Total number of unique tracks (cells) -  4366
Total number of divisions             -  315
Average number of frames per track    -  28


# EOF

In [None]:
from deepcell.utils.data_utils import load_trks as load_trks

# Load a single trk file and print the shape of its raw / tracked entries
# load_trk is not defined or imported in this cell; it comes from the "Combining TRK
# Files" cell. The load_trks import above is not used here.
trk_file = '/data/data/ISBI_Tracking_Challenge/HeLa/nuc/HeLa_GTSEG_01_Batch00.trk'
trk = load_trk(trk_file)

# NOTE(review): zip() iterates the keys of the lineage dict alongside the first axis of
# raw / tracked, so the loop stops after min(number of cells, number of frames)
# iterations and `lineage` is only a cell label -- confirm this is what was intended.
for i, (lineage, raw, tracked) in enumerate(zip(trk["lineage"], trk["raw"], trk["tracked"])):
    print(raw.shape)
    print(tracked.shape)


In [11]:
# Load the raw and tracked arrays of HL60 training sequence 01 directly from .npy files
raw = np.load("/data/data/ISBI_Tracking_Challenge/HL60/nuc/Fluo-N2DH-SIM_Training/HL60_nuc_Training/01/raw.npy")
tracked = np.load("/data/data/ISBI_Tracking_Challenge/HL60/nuc/Fluo-N2DH-SIM_Training/HL60_nuc_Training/01/tracked.npy")

In [24]:
import matplotlib.pyplot as plt

# Channel of the arrays that is written out as images
channel = 0

# Save every tracked frame as a PNG; the raw-image export is currently commented out.
# `os`, `raw` and `tracked` are not defined in this cell and must come from earlier cells.
for i in range(len(raw)):
#    name_raw = os.path.join('/data/data/ISBI_Tracking_Challenge/HL60/nuc/Fluo-N2DH-SIM_Training/HL60_nuc_Training/01/test_img_gif/raw_{:02}_.png'.format(i))
    name_tracked = os.path.join('/data/data/ISBI_Tracking_Challenge/HL60/nuc/Fluo-N2DH-SIM_Training/HL60_nuc_Training/01/test_img_gif/tracked_{:02}_.png'.format(i))
#    plt.imsave(name_raw, raw[i][:, :, channel], cmap='gray')
    # Fixed color limits (0-50) keep the color scale constant across frames
    plt.imsave(name_tracked, tracked[i][:, :, channel], cmap='cubehelix', vmin=0, vmax=50)



In [None]:


# Select the rows of one montage that have a non-zero value in the chosen column.
# The original line closed the 'montage' subscript with ')' and did not parse.
# NOTE(review): 'column_name' looks like a placeholder, and `montage` must already be
# defined by an earlier cell -- replace / confirm before running.
divisions_csv.loc[(divisions_csv['column_name'] != 0) & (divisions_csv['montage'] == montage)]

In [None]:
# Collect every entry of the current directory whose name contains 'set'
setlst = os.listdir('./')
all_sets = [term for term in setlst if 'set' in term]

# NOTE(review): `combine` is not defined anywhere in the visible notebook; it must be
# provided by another cell or module before this cell can run.
for set_name in all_sets:  # named set_name so the builtin `set` is not shadowed
    temp = os.listdir(os.path.join('.', set_name))
    base_direc = os.path.join('.', set_name, 'movie')
    output_path = os.path.join('.', set_name, 'final')
    # A set without an 'annotations' entry is treated as being split into parts
    partslst = []
    if 'annotations' not in temp:
        partslst = os.listdir(os.path.join('.', set_name))
    print(partslst)
    # The branch below was indented one level too deep under print(), which is an
    # IndentationError; it belongs to the body of the for loop.
    if len(partslst) == 0:
        print(base_direc, output_path)
        combine(base_direc, output_path)
    else:
        for part in partslst:
            base_direc = os.path.join('.', set_name, part, 'movie')
            output_path = os.path.join('.', set_name, part, 'final')
            combine(base_direc, output_path)