### Make Training Data (Movies/Images)

In [None]:
# This cell is not currently used

import os

# Collect the entries of the working directory whose name contains 'set'.
# Only directories are kept: the loop below calls os.listdir() on each entry,
# which raises for a plain file such as 'dataset.csv'.
setlst = os.listdir('./')
all_sets = [
    term for term in setlst
    if 'set' in term and os.path.isdir(os.path.join('.', term))
]

# The loop variable is deliberately not called `set`, so the builtin set()
# stays usable in every cell that runs after this one.
for set_name in all_sets:
    temp = os.listdir(os.path.join('.', set_name))
    direc_name = os.path.join('.', set_name, 'movie')
    output_path = os.path.join('.', set_name, 'final')
    # When there is no 'annotations' entry, list the directory's children
    # instead; otherwise the list stays empty.
    partslst = []
    if 'annotations' not in temp:
        partslst = os.listdir(os.path.join('.', set_name))
    print(partslst)
    
    

In [43]:
"""
make_training_data_tracking.py

Executing functions for creating npz files containing the training data
Functions will create training data for either
    - Patchwise sampling
    - Fully convolutional training of single image conv-nets
    - Fully convolutional training of movie conv-nets

Files should be placed in training directories with each separate
dataset getting its own folder

@author: David Van Valen
"""

"""
Import packages
"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
import os
import pathlib
import skimage as sk
import scipy as sp
from scipy import ndimage
from skimage import feature
from skimage import morphology as morph
from skimage.transform import resize
from sklearn.utils import class_weight
from deepcell import get_image
from deepcell import make_training_data
# from deepcell import format_coord as cf

# Input movie directory and destination of the generated npz file
direc_name = '/data/data/cells/HeLa/S3/set7/movie'
output_directory = '/data/npz_data/cells/HeLa/S3/movie/'
file_name_save = os.path.join(output_directory, 'nuclear_movie_HeLa_7_same.npz')

# Training directories are organized according to location within an image
num_x = 5  # number of horizontal samples
num_y = 5  # number of vertical samples

# Some movies/montages/samples do not contain cells or contain annotation errors
samples_to_drop = ['00_0', '00_1', '00_2', '04_1']
#samples_to_drop = []

# Enumerate every grid position and keep the ones that are not dropped
training_direcs = [
    name
    for name in ('0{}_{}'.format(i, j) for i in range(num_x) for j in range(num_y))
    if name not in samples_to_drop
]
channel_names = ["set"]

# Create the output directory, if necessary
pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)

# Create the training data
make_training_data(
    direc_name=direc_name,
    file_name_save=file_name_save,
    channel_names=channel_names,
    dimensionality=3,
    training_direcs=training_direcs,
    raw_image_direc="raw",
    annotation_direc="annotated",
    annotation_name="",
    border_mode="same",
    output_mode="conv",
    num_frames=40,
    reshape_size=None,
    display=False,
    num_of_frames_to_display=5,
    verbose=True)


In [4]:
import numpy as np
import os

base_path = '/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_HeLa_'
num_of_sets = 8

# Collect the per-set arrays and concatenate them once at the end, instead of
# re-copying the ever-growing result on every iteration.
# The empty seed arrays pin the expected per-movie shape (np.concatenate raises
# on a mismatch) and take part in dtype promotion exactly as the incremental
# version did, so the saved arrays are unchanged.
X_parts = [np.empty((0, 40, 216, 256, 1))]
y_parts = [np.empty((0, 40, 216, 256, 1))]
for movie in range(num_of_sets):
    path = base_path + str(movie) + '_same.npz'
    # The context manager closes the underlying file once both arrays are read
    with np.load(path) as data:
        print(list(data.keys()))
        X_to_load, y_to_load = data['X'], data['y']
    print('X Shape:', X_to_load.shape)
    print('y Shape:', y_to_load.shape)
    X_parts.append(X_to_load)
    y_parts.append(y_to_load)

# Final training data: all sets stacked along the batch axis
X_full = np.concatenate(X_parts, axis=0)
y_full = np.concatenate(y_parts, axis=0)

# Save the result to a new npz
output_directory = '/data/npz_data/cells/HeLa/S3/movie/'
file_name_save = os.path.join(output_directory, 'nuclear_movie_hela0-7_same.npz')

np.savez(file_name_save, X=X_full, y=y_full)


['y', 'X']
X Shape: (23, 40, 216, 256, 1)
y Shape: (23, 40, 216, 256, 1)
['y', 'X']
X Shape: (23, 40, 216, 256, 1)
y Shape: (23, 40, 216, 256, 1)
['y', 'X']
X Shape: (24, 40, 216, 256, 1)
y Shape: (24, 40, 216, 256, 1)
['y', 'X']
X Shape: (21, 40, 216, 256, 1)
y Shape: (21, 40, 216, 256, 1)
['y', 'X']
X Shape: (22, 40, 216, 256, 1)
y Shape: (22, 40, 216, 256, 1)
['y', 'X']
X Shape: (24, 40, 216, 256, 1)
y Shape: (24, 40, 216, 256, 1)
['y', 'X']
X Shape: (22, 40, 216, 256, 1)
y Shape: (22, 40, 216, 256, 1)
['y', 'X']
X Shape: (21, 40, 216, 256, 1)
y Shape: (21, 40, 216, 256, 1)


In [45]:
# Verify the result: reload the combined file and report its keys and shapes
with np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same.npz') as data:
    # The keys belong to the loaded npz (`data`); `x` was never defined here
    print(list(data.keys()))
    # Read each array from disk once
    data_readable_X, data_readable_y = data['X'], data['y']

# Kept for cells that still refer to the older names
X_to_load, y_to_load = data_readable_X, data_readable_y

print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (180, 40, 216, 256, 1)
y Shape: (180, 40, 216, 256, 1)


### Make Training Data (Division Information)

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy as sp

csv_path = '/data/npz_data/cells/HeLa/S3/movie/divisions-HeLa.csv'

# Open the .csv file containing hand-curated cell division data
divisions_csv = pd.read_csv(csv_path)

# Replace missing (NaN) entries with 0. Cells further down test
# `daughter != 0` to decide whether a row records a division, so the
# placeholder has to be 0 rather than an empty string.
divisions_csv = divisions_csv.fillna(0)

In [2]:
# Discard every row flagged in the 'dont use (true)' column

# Boolean mask of the rows to discard
throw_away_indices = divisions_csv[r'dont use (true)'].eq(True)

# Everything that is not flagged is kept
keep_indices = ~throw_away_indices

divisions_csv = divisions_csv[keep_indices]

divisions_csv.head(6201)

Unnamed: 0,number,cell_type,set,montage,label,daughter,frame_div (0-index),no cells (true),dont use (true),Notes,Unnamed: 10
0,1,hela,0,00_0,1,0,0.0,0,0,0,0.0
1,1,hela,0,00_0,2,0,0.0,0,0,0,1.0
2,1,hela,0,00_0,3,0,0.0,0,0,0,2.0
3,1,hela,0,00_0,4,0,0.0,0,0,0,3.0
4,1,hela,0,00_0,5,0,0.0,0,0,0,4.0
5,1,hela,0,00_0,6,0,0.0,0,0,0,5.0
6,1,hela,0,00_0,7,0,0.0,0,0,0,6.0
7,1,hela,0,00_0,8,0,0.0,0,0,0,7.0
8,1,hela,0,00_0,9,0,0.0,0,0,0,8.0
9,1,hela,0,00_0,10,89,38.0,0,0,0,9.0


In [3]:
def division_per_montage(set_num, montage, divisions=None, max_cells=31):
    """Build the per-parent daughter arrays for one montage.

    Rows of the divisions table are selected when their ``set`` equals
    ``set_num``, their ``montage`` equals ``montage`` and their ``daughter``
    entry is not 0. For each selected row the ``label`` column is used as the
    index of the parent and the ``daughter`` column (a comma-separated string
    such as ``'8,9'``, or a single number) supplies the daughter labels.

    Args:
        set_num: value to match against the ``set`` column.
        montage: value to match against the ``montage`` column.
        divisions: table to read; must provide the ``set``, ``montage``,
            ``label`` and ``daughter`` columns. Defaults to the notebook-level
            ``divisions_csv``.
        max_cells: length of the returned list (one slot per parent label).

    Returns:
        A list of ``max_cells`` numpy arrays. Slot ``k`` holds the daughter
        labels of parent ``k`` and is an empty array when no division is
        recorded for that label. If a parent appears in several rows, the
        last row wins.

    Raises:
        IndexError: if a parent label falls outside ``0 .. max_cells - 1``.
    """
    if divisions is None:
        # Fall back to the table loaded earlier in the notebook
        divisions = divisions_csv

    npz_arr = [np.array([]) for _ in range(max_cells)]
    for row in divisions.itertuples():
        if row.set != set_num or row.montage != montage or row.daughter == 0:
            continue

        parent = int(row.label)
        if not 0 <= parent < max_cells:
            raise IndexError(
                'parent label {} is outside the supported range 0..{}'.format(
                    parent, max_cells - 1))

        if isinstance(row.daughter, str):
            daughter_values = [int(x) for x in row.daughter.split(',')]
        else:
            # A lone daughter may have been parsed as a number, not a string
            daughter_values = [int(row.daughter)]
        npz_arr[parent] = np.array(daughter_values)

    return npz_arr

In [4]:
num_of_sets = 8

# unique_montages contains a list of each montage grouped by set - index by unique_montages[set][montage = 00_0, 00_1]
unique_montages = divisions_csv['montage'].groupby(divisions_csv['set']).unique()

# One list of per-parent daughter arrays for every (set, montage) pair
per_montage = []
for set_num in range(num_of_sets):
    for montage in unique_montages[set_num]:
        per_montage.append(division_per_montage(set_num, montage))

# Build the (montage, parent) object array explicitly. Handing the nested,
# ragged arrays straight to np.array() raises on recent NumPy releases, and
# yields a plain 3-D numeric array whenever all entries share one length.
cells_per_montage = len(per_montage[0]) if per_montage else 0
children = np.empty((len(per_montage), cells_per_montage), dtype=object)
for batch, daughter_lists in enumerate(per_montage):
    for i, lst in enumerate(daughter_lists):
        children[batch, i] = np.asarray(lst, dtype='int32')

np.savez('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same_kids.npz', daughters=children)

In [5]:
# Display the assembled array of per-montage daughter lists
children

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       ...,
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
 

### Let's check against the original file for formatting

In [6]:
# Look at the newly generated kids npz to see if it looks correct
# 'daughters' is an object array (an array of arrays), so NumPy needs
# allow_pickle=True to read it back. Only use this on files you trust.
data = np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same_kids.npz',
               allow_pickle=True)
data.keys()

['daughters']

In [7]:
# Store data from keys to deconstruct
data_readable = data['daughters']
data_readable.shape

(180, 31)

In [13]:
# Inspect the structure of the first two rows
data_readable[:2, :]

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([8, 9], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        arr

In [14]:
# Look at the original kids npz to see if it looks correct
# 'daughters' is an object array (an array of arrays), so NumPy needs
# allow_pickle=True to read it back. Only use this on files you trust.
data = np.load('/data/npz_data/cells/HeLa/S3/movie/combined_daugthers.npz',
               allow_pickle=True)
data.keys()

['daughters']

In [15]:
# Store data from keys to deconstruct
data_readable = data['daughters']
data_readable.shape

(180, 31)

In [16]:
# Inspect the structure of the first two rows
data_readable[:2, :]

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([8, 9], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        arr

### Ending Checks

# EOF

In [None]:
# List of all montages
# NOTE(review): base_direc is not assigned in this cell; it has to be defined
# by an earlier-run cell before this one can execute.
movies = os.listdir(base_direc)
movies.sort()

# Read the per-movie division lists
children = []
for movie in movies:
    path = os.path.join(base_direc, movie, 'division.npz')
    # The stored array holds arrays of differing length (object dtype), which
    # NumPy only reads back with allow_pickle=True. Only load trusted files.
    with np.load(path, allow_pickle=True) as training_data:
        children.append(training_data['arr_0'].tolist())

# Assemble a (movie, parent) object array of int32 daughter arrays. The dtype
# is spelled np.int32: a bare `int32` is not defined in this notebook.
# Passing the ragged nested lists to np.array() directly raises on recent
# NumPy releases, so the object array is filled in explicitly.
cells_per_movie = len(children[0]) if children else 0
stacked = np.empty((len(children), cells_per_movie), dtype=object)
for batch, row in enumerate(children):
    if len(row) != cells_per_movie:
        raise ValueError('movie {} has {} entries, expected {}'.format(
            movies[batch], len(row), cells_per_movie))
    for i, lst in enumerate(row):
        stacked[batch, i] = np.asarray(lst, dtype=np.int32)

children = stacked
#np.savez(os.path.join(output_path, 'combined_daugthers.npz'), daughters=children)

In [None]:
def combine(base_direc, output_path):
    """Merge the ``division.npz`` of every movie into one npz file.

    Each sub-directory of ``base_direc`` (visited in sorted name order) that
    contains a ``division.npz`` contributes one row; sub-directories without
    that file are skipped. The ``arr_0`` entry of each file is read as a
    sequence of per-parent daughter lists, and every list is converted to an
    int32 array. The rows are written to
    ``<output_path>/combined_daugthers.npz`` under the key ``daughters``.

    Args:
        base_direc: directory holding one sub-directory per movie.
        output_path: directory receiving the combined file; it is created
            when it does not exist yet.

    Raises:
        ValueError: if the movies do not all provide the same number of
            per-parent entries.
    """
    used_movies = []
    children = []
    for movie in sorted(os.listdir(base_direc)):
        path = os.path.join(base_direc, movie, 'division.npz')
        if not os.path.isfile(path):
            continue
        print(movie)
        # The stored array is of object dtype, which NumPy only reads back
        # with allow_pickle=True. Only load files you trust.
        with np.load(path, allow_pickle=True) as training_data:
            children.append(training_data['arr_0'].tolist())
        used_movies.append(movie)

    # Fill a (movie, parent) object array explicitly: np.array() on ragged
    # nested lists raises on recent NumPy releases. np.int32 replaces the
    # undefined bare name `int32`.
    cells_per_movie = len(children[0]) if children else 0
    combined = np.empty((len(children), cells_per_movie), dtype=object)
    for batch, row in enumerate(children):
        if len(row) != cells_per_movie:
            raise ValueError('movie {} has {} entries, expected {}'.format(
                used_movies[batch], len(row), cells_per_movie))
        for i, lst in enumerate(row):
            combined[batch, i] = np.asarray(lst, dtype=np.int32)

    os.makedirs(output_path, exist_ok=True)
    np.savez(os.path.join(output_path, 'combined_daugthers.npz'), daughters=combined)

In [None]:


# Rows of one montage that have a non-zero daughter entry.
# The closing bracket after 'montage' was misplaced, which made the line a
# syntax error.
# NOTE(review): 'column_name' looked like a placeholder; 'daughter' is used
# here because the other cells test that column against 0 - confirm.
# `montage` must already be defined by an earlier-run cell.
divisions_csv.loc[(divisions_csv['daughter'] != 0) & (divisions_csv['montage'] == montage)]

In [52]:
# Count the rows whose daughter entry is non-zero. The column is named
# 'daughter' in the loaded table, so the former `row.daugter` lookup
# raised AttributeError.
count = int((divisions_csv['daughter'] != 0).sum())
print('number of divisions:', count)

number of divisions: 102


In [None]:
# Common prefix of the per-set file names and the number of sets
base_path = '/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_HeLa_'
num_of_sets = 8

# Build the division file name for every set index (the value is only
# assigned to `path`; nothing is read or written here)
for movie in range(num_of_sets):
    path = os.path.join('{}{}division.npz'.format(base_path, movie))

In [None]:
# Collect the entries of the working directory whose name contains 'set'.
# Only directories are kept, because each entry is passed to os.listdir().
setlst = os.listdir('./')
all_sets = [
    term for term in setlst
    if 'set' in term and os.path.isdir(os.path.join('.', term))
]

# The loop variable is not called `set`, so the builtin set() stays usable
# in cells that run after this one.
for set_name in all_sets:
    temp = os.listdir(os.path.join('.', set_name))
    base_direc = os.path.join('.', set_name, 'movie')
    output_path = os.path.join('.', set_name, 'final')
    # Without an 'annotations' entry the directory's children are treated as
    # parts; otherwise the part list stays empty.
    partslst = []
    if 'annotations' not in temp:
        partslst = os.listdir(os.path.join('.', set_name))
    print(partslst)
    # This branch was indented one level too deep, which made the whole cell
    # an IndentationError; it belongs to the loop body.
    if len(partslst) == 0:
        print(base_direc, output_path)
        combine(base_direc, output_path)
    else:
        for part in partslst:
            base_direc = os.path.join('.', set_name, part, 'movie')
            output_path = os.path.join('.', set_name, part, 'final')
            combine(base_direc, output_path)

In [None]:
#x = np.empty([2, 31], dtype='int32')

# A single montage: 31 empty daughter lists
npz_arr = [[] for _ in range(31)]

# Stack two blank montages into one array
x = np.array([np.array(npz_arr, dtype='int32') for _ in range(2)])
x.shape

# Save it as an npz file
# np.savez('/home/HeLa_output/set0_files/04_2/output.npz', npz_arr)

In [58]:
# Build division npz for each montage (movie)
set_num = 0
# unique_montages contains a list of each montage grouped by set - index by unique_montages[set][montage = 00_0, 00_1]
unique_montages = divisions_csv['montage'].groupby(divisions_csv['set']).unique()

# Collect parent labels and their daughter lists for montage '00_0' of the
# chosen set. The column is named 'daughter' in the loaded table, so the
# former `row.daugter` lookups raised AttributeError.
parents = []
daughters = []
for row in divisions_csv.itertuples():
    if row.set == set_num and row.montage == '00_0' and row.daughter != 0:
        parents.append(row.label)
        daughter_values = [int(x) for x in row.daughter.split(',')]
        daughters.append(daughter_values)

# One slot per parent label; slots without a recorded division stay empty
npz_arr = []
for i in range(31):
    npz_arr.append(np.array([]))
for idx, parent in enumerate(parents):
    ind = int(parent)
    npz_arr[ind] = np.array(daughters[idx])

#np.savez(os.path.join(output_dir, 'division.npz'), npz_arr)
