In [16]:
from dask import dataframe as dd
import pandas as pd
import os
from dask import array as da

opj = os.path.join

tolerance_xy = 30
tolerance_z = 10

from collections import defaultdict
import numpy as np
from tqdm import tqdm

from scipy.spatial.distance import cdist
from scipy import ndimage
from operator import add

In [2]:
input_path = r'C:\Users\Wojtek\Documents\Doktorat\Astral\examples\A_1_3_adj_copy'

In [17]:
seg_dims_name = r'segmentation_dims.h5'
abs_name = r'segmentation_absolute.h5'

In [18]:
absolute_df = pd.read_hdf(opj(input_path, abs_name), key='df')
dims_df = pd.read_hdf(opj(input_path, seg_dims_name), key='df')

In [21]:
absolute_df.to_hdf(opj(input_path, 'abs1.hdf'), key='df', format='table')
dims_df.to_hdf(opj(input_path, 'dims1.hdf'), key='df', format='table')

In [3]:
abs1 = dd.read_hdf(opj(input_path, 'abs1.hdf'), key='df')
dims1 = dd.read_hdf(opj(input_path, 'dims1.hdf'), key='df')

In [4]:
def get_bounding_box_neighbours(dimensions_df, tolerance_xy, tolerance_z):
    neighbours_dict = defaultdict(dict)
    ids = np.unique(dimensions_df.id.values)
    for i in tqdm(ids):

        shape = dimensions_df.loc[dimensions_df['id'] == i].compute()
        candidate_neighbors = get_neighbor_shapes(shape, tolerance_xy, tolerance_z)
        candidate_ids = list(np.unique(candidate_neighbors.id.values))
        neighbours_dict[i] = candidate_ids

    return neighbours_dict

def get_neighbor_shapes(shape, tolerance_xy, tolerance_z):

    xb, yb, zb = get_tolerance_bounding_box(shape, tolerance_xy, tolerance_z)

    neighbor_shapes = dims1.loc[(dims1['x_min'].between(
        *xb)) | (dims1['x_max'].between(*xb))].compute()
    neighbor_shapes = neighbor_shapes.loc[(neighbor_shapes['y_min'].between(*yb))
                                          | (neighbor_shapes['y_max'].between(*yb))]
    neighbor_shapes = neighbor_shapes.loc[(neighbor_shapes['z_min'].between(*zb))
                                          | (neighbor_shapes['z_max'].between(*zb))]

    return neighbor_shapes

def get_tolerance_bounding_box(shape, tolerance_xy, tolerance_z):

    xl = shape['x_min'].values[0] - tolerance_xy
    yl = shape['y_min'].values[0] - tolerance_xy
    zl = shape['z_min'].values[0] - tolerance_z

    xu = shape['x_max'].values[0] + tolerance_xy
    yu = shape['y_max'].values[0] + tolerance_xy
    zu = shape['z_max'].values[0] + tolerance_z

    return (xl, xu), (yl, yu), (zl, zu)

In [11]:
neighbors_dict = get_bounding_box_neighbours(dims1, tolerance_xy, tolerance_z)

100%|██████████████████████████████████████████████████████████████████████████████| 1394/1394 [00:37<00:00, 36.95it/s]


In [58]:
shape1, neighbour_ids = list(neighbors_dict.keys())[0], list(neighbors_dict.values())[0]

In [7]:
def generate_neighbour_data_for(shape1_id, neighbour_ids):

    abs_df_chunk = abs1.loc[(abs1['id'] == shape1_id)
                                        | (abs1['id'].isin(neighbour_ids))].compute()
    dims_df_chunk = dims1.loc[(dims1['id'] == shape1_id) |
                                           (dims1['id'].isin(neighbour_ids))].compute()

    shape1_id_row_data = []

    for shape2_id in neighbour_ids:
        shape1_id_row = generate_neighbor_row(shape1_id, shape2_id, abs_df_chunk, dims_df_chunk)
        shape1_id_row_data.append(shape1_id_row)
    return shape1_id_row_data

def generate_neighbor_row( shape1_id, shape2_id, abs_df_chunk, dims_df_chunk):
    center_dist_xy, center_dist_z = calculate_euc_dists(dims_df_chunk, shape1_id, shape2_id)

    com_dist_xy, com_dist_t = calculate_center_of_mass_dists(
        dims_df_chunk, abs_df_chunk, shape1_id, shape2_id)

    # row_dict = {
    #     'shape_id_1': shape1_id,
    #     'shape_id_2': shape2_id,
    #     'center_dist_xy': center_dist_xy,
    #     'center_dist_t': center_dist_z,
    #     'center_of_mass_dist_xy': com_dist_xy,
    #     'center_of_mass_dist_t': com_dist_t
    # }

    row = [shape1_id, shape2_id, center_dist_xy, center_dist_z, com_dist_xy, com_dist_t]

    return row

def calculate_euc_dists(ddf, shape1, shape2):
    xy_axis1 = ddf.loc[ddf.id == shape1, ['center_y', 'center_x']].values
    xy_axis2 = ddf.loc[ddf.id == shape2, ['center_y', 'center_x']].values
    center_dist_xy = cdist(xy_axis1, xy_axis2)[0][0]

    z_axis1 = ddf.loc[ddf.id == shape1, ['center_z']].values
    z_axis1 = np.vstack([z_axis1, np.zeros((z_axis1.shape[0]))]).T
    z_axis2 = ddf.loc[ddf.id == shape2, ['center_z']].values
    z_axis2 = np.vstack([z_axis2, np.zeros((z_axis2.shape[0]))]).T

    center_dist_z = cdist(z_axis1, z_axis2)[0][0]

    return center_dist_xy, center_dist_z

def calculate_center_of_mass_dists(ddf, adf, shape1_id, shape2_id):

    shape1 = adf.loc[adf['id'] == shape1_id]
    shape2 = adf.loc[adf['id'] == shape2_id]
    shapes = [shape1, shape2]

    shapes = list(map(lambda df: df[['x', 'y', 'z']], shapes))

    offsets = []

    coms = []

    for shape in shapes:
        offsets.append([shape.x.min(), shape.y.min(), shape.z.min()])
        shape.x = shape.x - shape.x.min()
        shape.y = shape.y - shape.y.min()
        shape.z = shape.z - shape.z.min()

        indices = shape.values
        
        shape_np = np.zeros((indices[:, 0].max() + 1, indices[:, 1].max() + 1, indices[:, 2].max() + 1))
        shape_np[indices[:, 0], indices[:, 1], indices[:, 2]] = 1

        # print(np.unique(shape_np))
        com = ndimage.measurements.center_of_mass(shape_np)
        # print(com)
        com = list(map(lambda x: int(x), com))
        coms.append(com)

    coms_offset = []

    for com, offset in zip(coms, offsets):
        coms_offset.append(list(map(add, com, offset)))

    xy1, xy2 = coms_offset[0][:2], coms_offset[1][:2]
    xy1, xy2 = np.array(xy1), np.array(xy2)
    xy1, xy2 = np.expand_dims(xy1, -1).T, np.expand_dims(xy2, -1).T
    z1, z2 = coms_offset[0][2], coms_offset[1][2]

    com_dist_z = abs(z1-z2)
    com_dist_xy = cdist(xy1, xy2)[0][0]

    return com_dist_xy, com_dist_z

In [85]:
%%time

shape1_neighbor_data = generate_neighbour_data_for(shape1, neighbour_ids)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Wall time: 3min 32s


In [12]:
shape_ids, shape_neighbour_ids = list(neighbors_dict.keys()), list(neighbors_dict.values())
shape_ids, shape_neighbour_ids = shape_ids[:10], shape_neighbour_ids[:10]



In [20]:
from dask import delayed

@delayed
def get_neighbors(shape_ids, shape_neighbour_ids):
    row_data = []
    for shape_id, neighbor_ids in zip(shape_ids, shape_neighbour_ids):
        shape_neighbor_data = generate_neighbour_data_for(shape_id, neighbor_ids)
        row_data.extend(shape_neighbor_data)
        
    return shape_neighbor_data

In [21]:
data = get_neighbors(shape_ids, shape_neighbour_ids)

In [22]:
%%time

a = data.compute(scheduler='multiprocessing')

Wall time: 13min 17s


In [99]:
shape1_neighbor_data1 = test.compute()

ValueError: Metadata inference failed in `eq`.

Original error is below:
------------------------
ValueError('Lengths must match to compare')

Traceback:
---------
  File "c:\programdata\anaconda3\envs\astro\lib\site-packages\dask\dataframe\utils.py", line 167, in raise_on_meta_error
    yield
  File "c:\programdata\anaconda3\envs\astro\lib\site-packages\dask\dataframe\core.py", line 4929, in elemwise
    meta = partial_by_order(*parts, function=op, other=other)
  File "c:\programdata\anaconda3\envs\astro\lib\site-packages\dask\utils.py", line 1089, in partial_by_order
    return function(*args2, **kwargs)
  File "c:\programdata\anaconda3\envs\astro\lib\site-packages\pandas\core\ops\__init__.py", line 1207, in wrapper
    raise ValueError("Lengths must match to compare")


In [13]:
absolute_df

Unnamed: 0,id,y,x,z,color
0,0,134,908,959,10
1,0,134,909,959,10
2,0,134,910,959,10
3,0,134,911,959,10
4,0,135,908,959,10
...,...,...,...,...,...
34125647,1393,365,531,343,78
34125648,1393,366,528,343,78
34125649,1393,366,529,343,78
34125650,1393,366,530,343,78


In [14]:
dims_df

Unnamed: 0,id,y_min,y_max,x_min,x_max,z_min,z_max,center_y,center_x,center_z
0,0,0,461,575,959,959,1199,230,767,1079
1,1,0,234,160,835,650,876,117,497,763
2,2,192,607,65,565,818,1145,399,315,981
3,3,334,607,341,860,499,660,470,600,579
4,4,185,443,596,959,325,551,314,777,438
...,...,...,...,...,...,...,...,...,...,...
1389,1389,134,141,139,150,1029,1032,137,144,1030
1390,1390,423,433,592,602,1002,1005,428,597,1003
1391,1391,519,535,686,696,440,444,527,691,442
1392,1392,245,255,266,273,398,402,250,269,400


In [None]:
def run(tolerance_xy, tolerance_z, absolute_df, dimensions_df):
    
    