In [1]:
# Reload imported modules automatically before each cell is executed, so that
# edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Change the working directory two levels up; the relative `data/...` paths
# used in the following cells are resolved against that directory.
%cd ../..

# Print the name of the machine this kernel is running on.
!hostname

/p/fastdata/pli/Private/oberstrass1/datasets/vervet1818-3d
jrlogin02.jureca


In [2]:
import os

import re
import pandas as pd
import numpy as np

import h5py as h5

import pli
import pli.image as im

from tqdm import tqdm

In [3]:
# Get mask and feature info
#
# Builds `files_df`: one row per section id for which both a mask file and a
# feature file exist, with the file paths plus the `spacing` / `origin`
# attributes stored on the feature group.

model_name = "pli_transret_histo"

feature_path = f"data/aa/pca/{model_name}"
mask_path = "data/aa/masks/cortex/"

# Group of the mask in the H5 files
mask_group = 'Image'

# Group of the features in the H5 files
feature_group = "PCA"

###

# A file is treated as a section file when its name contains `s<4 digits>_`
# followed (anywhere later) by `h5`; the four digits are the section id.
p = re.compile('.*s([0-9]{4})_.*h5')


def _list_section_files(folder):
    """Yield `(section_id, file_path)` for each file in `folder` matching `p`.

    File names are visited in sorted order; names that do not match the
    section pattern are skipped. `section_id` is the captured digits as int.
    """
    for file_name in sorted(os.listdir(folder)):
        match = p.match(file_name)
        if match:
            yield int(match[1]), os.path.join(folder, file_name)


feature_list = []
for section_id, file_features in _list_section_files(feature_path):
    # Open read-only explicitly: only attributes are read here, and h5py
    # releases before 3.0 did not default to read-only mode.
    with h5.File(file_features, 'r') as h5f:
        spacing = h5f[feature_group].attrs['spacing']
        origin = h5f[feature_group].attrs['origin']
    feature_list.append({
        'id': section_id,
        'spacing': spacing,
        'origin': origin,
        'file_features': file_features,
    })
# Fail early with a readable message instead of a KeyError inside the merge.
assert feature_list, f"No feature files matching the section pattern found in {feature_path}"
feature_df = pd.DataFrame(feature_list)

mask_list = [
    {'id': section_id, 'file_mask': file_mask}
    for section_id, file_mask in _list_section_files(mask_path)
]
assert mask_list, f"No mask files matching the section pattern found in {mask_path}"
mask_df = pd.DataFrame(mask_list)

# Keep only sections that have both a mask and features, ordered by section id.
files_df = mask_df.merge(feature_df, on='id', how='inner').sort_values('id').reset_index(drop=True)
assert len(files_df) > 0, "No section id is present in both the mask and the feature folder"

files_df.head()

Unnamed: 0,id,file_mask,spacing,origin,file_features
0,841,data/aa/masks/cortex/Vervet1818aa_60mu_70ms_s0...,"[84.37760192, 84.37760192]","[0.0, 0.0]",data/aa/pca/pli_transret_histo/Vervet1818aa_60...
1,842,data/aa/masks/cortex/Vervet1818aa_60mu_70ms_s0...,"[84.37760192, 84.37760192]","[0.0, 0.0]",data/aa/pca/pli_transret_histo/Vervet1818aa_60...
2,843,data/aa/masks/cortex/Vervet1818aa_60mu_70ms_s0...,"[84.37760192, 84.37760192]","[0.0, 0.0]",data/aa/pca/pli_transret_histo/Vervet1818aa_60...
3,844,data/aa/masks/cortex/Vervet1818aa_60mu_70ms_s0...,"[84.37760192, 84.37760192]","[0.0, 0.0]",data/aa/pca/pli_transret_histo/Vervet1818aa_60...
4,845,data/aa/masks/cortex/Vervet1818aa_60mu_70ms_s0...,"[84.37760192, 84.37760192]","[0.0, 0.0]",data/aa/pca/pli_transret_histo/Vervet1818aa_60...


In [4]:
# Load all PCA sections
#
# Reads the features and mask of every section listed in `files_df`,
# optionally smooths the features, and stacks the foreground feature vectors
# of all sections into `valid_features` (one row per pixel).

# Mask pyramid corresponding to the feature size
mask_pyramid = 6

# Masking of features to include foreground only
mask_features = True
background_class = 3

# Smoothing of the features (Gaussian sigma; values <= 0 disable smoothing)
sigma = 1.0

###

import inspect

from skimage import filters
from vervet1818_3d.utils.io import read_masked_features

# scikit-image 0.19 introduced `channel_axis` and deprecated `multichannel`,
# which later releases no longer accept. Select whichever keyword the
# installed version understands so the last axis is always treated as
# channels and is not smoothed across.
if 'channel_axis' in inspect.signature(filters.gaussian).parameters:
    gaussian_channel_kwargs = {'channel_axis': -1}
else:
    gaussian_channel_kwargs = {'multichannel': True}

selected_features = []
selected_masks = []

# `files_df` is already ordered by section id. Iterating it directly (rather
# than a re-sorted copy) guarantees that `selected_features[i]` and
# `selected_masks[i]` belong to row `i` of `files_df`.
for k, r in tqdm(files_df.iterrows(), total=len(files_df)):
    features, mask = read_masked_features(
        r.file_features,
        r.file_mask,
        mask_pyramid=mask_pyramid,
        data_group=feature_group,
        mask_group=mask_group
    )
    # The mask must cover exactly the two leading (spatial) axes of the features.
    assert features.shape[:2] == mask.shape, f"{features.shape[:2]} differs from {mask.shape}"

    # Smooth features a bit
    if sigma > 0.:
        features = filters.gaussian(features, sigma=sigma, **gaussian_channel_kwargs)

    selected_features.append(features)
    selected_masks.append(mask)

if mask_features:
    # Boolean indexing with the 2D mask yields one row per foreground pixel.
    valid_features = [f[m != background_class] for f, m in zip(selected_features, selected_masks)]
else:
    # Keep every pixel: flatten the leading axes, keep the channel axis.
    valid_features = [sf.reshape(-1, sf.shape[-1]) for sf in selected_features]

# Number of retained feature vectors per section, in `files_df` order.
valid_lengths = [len(vf) for vf in valid_features]
valid_features = np.vstack(valid_features)

print(f"Valid features have shape {valid_features.shape}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 232/232 [00:04<00:00, 47.51it/s]


Valid features have shape (16340715, 4)


In [5]:
# Fit k-means on a random subsample of the valid feature vectors

n_clusters = 20          # number of clusters to fit
n_subfeatures = 32_000   # number of feature vectors drawn for the fit
seed = 299792458         # seeds both the NumPy draw and KMeans

###

from sklearn.cluster import KMeans

# Seed NumPy's global generator so the subsample is reproducible.
np.random.seed(seed)

# Draw row indices into `valid_features`. The draw is with replacement
# (the np.random.choice default), spelled out here for clarity.
population = np.arange(len(valid_features))
subsample_ix = np.random.choice(population, size=n_subfeatures, replace=True)

km = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    max_iter=1000,
    tol=1e-4,
    random_state=seed,
)
km.fit(valid_features[subsample_ix])

KMeans(max_iter=1000, n_clusters=20, random_state=299792458)

In [6]:
# Create masks
#
# Predicts a cluster label for every foreground pixel of each section and
# writes the label image to one HDF5 file per section. Background pixels are
# stored as 0; cluster `c` of the fitted model is stored as `c + 1`.

dtype = np.uint8  # np.uint8 for label images; bool collapses all labels to True
background_class = 3

out_folder = "data/aa/clusters/kmeans/"
out_group = 'Mask'

chunk_size = 128
compression = 'gzip'

###

# Stored labels run from 1 to n_clusters; refuse integer dtypes that would
# silently wrap around instead of writing corrupted labels.
if np.issubdtype(dtype, np.integer) and n_clusters > np.iinfo(dtype).max:
    raise ValueError(f"dtype {np.dtype(dtype)} cannot hold {n_clusters} cluster labels")

# zip() below would silently truncate if the cached features/masks are stale
# with respect to `files_df` (e.g. cells executed out of order).
assert len(selected_features) == len(selected_masks) == len(files_df), \
    "selected_features / selected_masks are out of sync with files_df"

out_path = os.path.join(out_folder, model_name)

if not os.path.exists(out_path):
    print("Create path", out_path)
# Also creates missing parent folders; a no-op when the path already exists.
os.makedirs(out_path, exist_ok=True)

print("Write cluster sections to", out_path)
for sf, sm, (k, r) in tqdm(zip(selected_features, selected_masks, files_df.iterrows()), total=len(files_df)):

    foreground = sm != background_class

    cluster_array = np.zeros(sf.shape[:2], dtype=dtype)
    # A section without any foreground pixel stays all-zero; predicting on
    # an empty sample array would raise.
    if foreground.any():
        # Get predictions
        predictions = km.predict(sf[foreground])
        cluster_array[foreground] = predictions + 1

    out_file = f"Kmeans_{n_clusters}_s{r.id:04d}.h5"
    out_section = pli.data.Section(image=cluster_array)
    # Carry over the geometry read from the feature file of this section.
    out_section.spacing = r.spacing
    out_section.origin = r.origin
    out_section.modality = 'Mask'
    out_section.attrs['n_clusters'] = n_clusters
    out_section.attrs['cluster_centers'] = km.cluster_centers_

    out_section.to_hdf5(
        os.path.join(out_path, out_file),
        chunk_size=chunk_size,
        compression=compression,
        pyramid=False,
        overwrite=True
    )
    out_section.close_file_handle()

Create path data/aa/clusters/kmeans/pli_transret_histo
Write cluster sections to data/aa/clusters/kmeans/pli_transret_histo


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 232/232 [00:14<00:00, 15.54it/s]
