In [1]:
import os
from pathlib import Path
import pickle
from time import time
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns

from catch_converter.parse_contours import parse_cvi42ws
import CATCH_utils
from Mini_LL import *

In [2]:
# load cases
def _load_pickle(path):
    """Unpickle a single file and close its handle afterwards.

    NOTE: pickle.load can execute arbitrary code while loading; only point
    this at files that were produced by a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# NOTE(review): these are absolute, machine-specific paths; consider moving
# them into a configurable data directory so the notebook runs elsewhere.
bp = '/Users/dietrichhadler/Desktop/Daten/test/data/segm1'
case_paths = [os.path.join(bp,p) for p in os.listdir(bp) if p.endswith('.pickle')]
cases1 = [_load_pickle(p) for p in case_paths]
bp = '/Users/dietrichhadler/Desktop/Daten/test/data/segm2'
case_paths = [os.path.join(bp,p) for p in os.listdir(bp) if p.endswith('.pickle')]
cases2 = [_load_pickle(p) for p in case_paths]

# os.listdir returns entries in arbitrary order, so both lists are sorted by
# the same key to make them comparable position by position.
cases1 = sorted(cases1, key=lambda c: c.studyinstanceuid)
cases2 = sorted(cases2, key=lambda c: c.studyinstanceuid)

# (name, value) pairs of every clinical result of every case, per segmentation
crs1 = [(cr.name, cr.get_cr()) for c in cases1 for cr in c.crs]
crs2 = [(cr.name, cr.get_cr()) for c in cases2 for cr in c.crs]
print(crs1)
print(crs2)

[('LVESV', 103.49857177734376), ('LVEDV', 184.06505126953124), ('RVESV', 0.0), ('RVEDV', 0.0), ('LVESV', 85.64158879876815), ('LVEDV', 151.91115689103995), ('RVESV', 0.0), ('RVEDV', 0.0), ('LVESV', 64.91293540545468), ('LVEDV', 167.98959989979244), ('RVESV', 0.0), ('RVEDV', 0.0)]
[('LVESV', 103.24478759765626), ('LVEDV', 179.65843505859374), ('RVESV', 0.0), ('RVEDV', 0.0), ('LVESV', 92.4678097364143), ('LVEDV', 157.89122290796485), ('RVESV', 0.0), ('RVEDV', 0.0), ('LVESV', 78.00391600297218), ('LVEDV', 183.68545235451745), ('RVESV', 0.0), ('RVEDV', 0.0)]


## Thoughts on Clustering
Preprocessing is essential: the "garbage in, garbage out" (GIGO) principle applies.<br/>


### We are looking for qualitatively different clusters
* Cluster different contours independently <br/>
* Maybe cluster apical / midv / basal independently? <br/>
* Maybe remove data that was not segmented by at least one of the segmenters? <br/>

### Only cluster a subset of problematic segmentation comparisons
* Problematic can be defined as DSC < value (like 80%)<br/>
* Or HD > 2mm<br/>
* Remove ... (TODO: unfinished note; specify what should be removed)
* Etc. ...

### Normalize data
* Clustering assumes the existence and comparability of distances. Values must therefore be normalized: we can't mix Dice in [0, 100], slice depth percentage in [0, 1], and ml diff in [-14.7, 6.5].<br/>
* Subtract the mean, divide by the standard deviation <br/>
* Possibly PCA whitening to remove collinearity of metrics

# Preprocessing functions

In [3]:
def contour_subset(table, cont_name):
    """Return the rows whose 'contour name' equals `cont_name`.

    The selection is made on a copy, so `table` itself is left untouched.
    """
    snapshot = table.copy()
    is_requested_contour = snapshot['contour name'].eq(cont_name)
    return snapshot.loc[is_requested_contour]

def dice_subset_segmentation_failures(table, value=80.0):
    """Return the rows whose 'DSC' is strictly below `value`.

    The selection is made on a copy, so `table` itself is left untouched.
    """
    snapshot = table.copy()
    below_threshold = snapshot['DSC'].lt(value)
    return snapshot.loc[below_threshold]

def hd_subset_segmentation_failures(table, value=2.0):
    """Return the rows whose 'HD' is strictly above `value`.

    The selection is made on a copy, so `table` itself is left untouched.
    """
    snapshot = table.copy()
    above_threshold = snapshot['HD'].gt(value)
    return snapshot.loc[above_threshold]

def absmldiff_subset_segmentation_failures(table, value=2.0):
    """Return the rows whose 'abs ml diff' is strictly above `value`.

    The selection is made on a copy, so `table` itself is left untouched.
    """
    snapshot = table.copy()
    above_threshold = snapshot['abs ml diff'].gt(value)
    return snapshot.loc[above_threshold]

def add_normalized_values(table):
    """Return a copy of `table` with z-scored versions of the metric columns.

    For each of 'DSC', 'HD', 'ml diff' and 'abs ml diff' a column named
    '<name> normalized' is added, holding (x - mean) / std, where mean and
    std (numpy default, ddof=0) are taken over the rows of `table`.

    A column without any spread (std == 0) is mapped to 0.0 instead of
    producing NaN from a 0/0 division. `table` itself is not modified.
    """
    ret_table = table.copy()
    names = ['DSC', 'HD', 'ml diff', 'abs ml diff']
    data = ret_table[names].values.astype(np.float64)
    centered = data - np.mean(data, axis=0)
    spread = np.std(data, axis=0)
    # A constant column has zero spread; dividing by 1 keeps it at 0.0.
    spread = np.where(spread == 0, 1.0, spread)
    data = centered / spread
    for n_i, n in enumerate(names):
        # Scalar column key: assigning a 1-D array to a one-element list key
        # (df[[name]] = arr) is rejected by current pandas versions.
        ret_table[n + ' normalized'] = data[:, n_i]
    return ret_table

def add_pca_whitened_values(table):
    """Return a copy of `table` with PCA-whitened metric components appended.

    A PCA with whiten=True is fitted on the 'DSC', 'HD', 'ml diff' and
    'abs ml diff' columns, and every resulting component i is stored in a
    new column 'pca_whitened_<i>'.

    Only the components the PCA actually returns are added: with fewer rows
    than metric columns the transform has fewer than four columns, and
    indexing four unconditionally would raise IndexError.
    `table` itself is not modified.
    """
    ret_table = table.copy()
    names = ['DSC', 'HD', 'ml diff', 'abs ml diff']
    data = ret_table[names].values.astype(np.float64)
    pca = PCA(whiten=True)
    whitened = pca.fit_transform(data)
    for comp_i in range(whitened.shape[1]):
        # Scalar column key: assigning a 1-D array to a one-element list key
        # (df[[name]] = arr) is rejected by current pandas versions.
        ret_table['pca_whitened_' + str(comp_i)] = whitened[:, comp_i]
    return ret_table

In [4]:
# Both case lists are paired purely by position, so they must describe the
# same studies in the same order; otherwise different studies would be
# compared with each other without any visible error.
if len(cases1) != len(cases2):
    raise ValueError('Number of cases differs between segmentations: '
                     + str(len(cases1)) + ' vs ' + str(len(cases2)))
case_comps = []
for reader1_case, reader2_case in zip(cases1, cases2):
    if reader1_case.studyinstanceuid != reader2_case.studyinstanceuid:
        raise ValueError('Cases paired by position belong to different studies: '
                         + str(reader1_case.studyinstanceuid) + ' vs '
                         + str(reader2_case.studyinstanceuid))
    case_comps.append(Case_Comparison(reader1_case, reader2_case))

# One contour-comparison table per case pair, stacked into a single frame.
tables = [SAX_CINE_analyzer(cc).get_case_contour_comparison_pandas_dataframe(fixed_phase_first_reader=False) for cc in case_comps]
master_table = pd.concat(tables)
print('len(master_table): ', len(master_table))

len(master_table):  416


# Myocardial Clustering

In [47]:
# Restrict the master table to the 'lv_myo' contour comparisons
lv_myo_all = contour_subset(master_table, 'lv_myo')
print('All myos: ', len(lv_myo_all))

# Keep only the comparisons whose Dice is below 90
lv_myo_poor = dice_subset_segmentation_failures(lv_myo_all, value=90.0)
print('Poor myos: ', len(lv_myo_poor))

# Append the normalized and the PCA-whitened metric columns;
# `table` is the name the plotting cells read from
table = add_pca_whitened_values(add_normalized_values(lv_myo_poor))
print('New table columns: ', table.columns)

All myos:  104
Poor myos:  41
New table columns:  Index(['case', 'reader1', 'reader2', 'sop1', 'sop2', 'category', 'slice',
       'max_slices', 'depth_perc', 'phase1', 'phase2', 'contour name', 'DSC',
       'HD', 'ml diff', 'abs ml diff', 'position1', 'position2',
       'has_contour1', 'has_contour2', 'DSC normalized', 'HD normalized',
       'ml diff normalized', 'abs ml diff normalized', 'pca_whitened_0',
       'pca_whitened_1', 'pca_whitened_2', 'pca_whitened_3'],
      dtype='object')


## Scatter plot of metric values and normalized metric values

### Add visualization techniques here
* PCA direction arrows
* Show comparison for point in axis

In [48]:
# Scatter of 'DSC' against 'HD'; both marker colour and marker size
# encode the 'abs ml diff' column of the same row.
fig, axes = plt.subplots(figsize=(16, 7))
axes.set_title('Dice vs HD')
sns.scatterplot(data=table, x='DSC', y='HD',
                hue='abs ml diff', size='abs ml diff', ax=axes)

<AxesSubplot:title={'center':'Dice vs HD'}, xlabel='DSC', ylabel='HD'>

In [57]:
%matplotlib
fig, axes = plt.subplots(1,1,figsize=(16,7))
axes.set_title('PCA c1 vs PCA c2')
sns.scatterplot(ax=axes, data=table, x='pca_whitened_1', y='pca_whitened_2', 
                size='abs ml diff', hue='abs ml diff', picker=4)

def onpick(event):
    """Pick-event handler: open a three-panel figure for the picked comparison.

    Reads the first picked row of the global `table`, finds the matching
    entry in the global `case_comps` by case name, loads the image for
    `sop1` plus both annotations' contours, and draws them side by side:
    reader 1 outlines (left), the comparison plot (middle) and reader 2
    outlines (right).
    """
    ind = event.ind
    print('onpick: ', ind)

    # Select the picked rows once; only the first one is used below.
    picked_rows = table.iloc[ind]

    def first_value(column):
        return picked_rows[column].values[0]

    case_name = first_value('case')
    phase     = first_value('category')
    reader1   = first_value('reader1')
    reader2   = first_value('reader2')
    slice_nr  = first_value('slice')
    cont_name = first_value('contour name')
    sop1      = first_value('sop1')
    sop2      = first_value('sop2')

    # First case comparison whose first case carries the picked case name
    matching = [cc for cc in case_comps if cc.case1.case_name==case_name]
    cc = matching[0]
    c1 = cc.case1
    c2 = cc.case2
    img   = c1.load_dcm(sop1).pixel_array
    cont1 = c1.load_anno(sop1).get_contour(cont_name)
    cont2 = c2.load_anno(sop2).get_contour(cont_name)

    fig, ax = plt.subplots(1,3, sharex=True, sharey=True)
    fig.suptitle(case_name + ', Phase: ' + str(phase) + ', Slice: ' + str(slice_nr))
    # Same background image in all three panels
    for panel in ax:
        panel.imshow(img)
    ax[0].set_title(reader1)
    ax[2].set_title(reader2)
    CATCH_utils.plot_outlines(ax[0], cont1)
    CATCH_utils.plot_outlines(ax[2], cont2)
    CATCH_utils.plot_geo_face_comparison(ax[1], cont1, cont2)
    fig.tight_layout()
    plt.show()
    
# Register onpick as the handler for pick events on this figure's canvas,
# then show the figure.
fig.canvas.mpl_connect('pick_event', onpick)
plt.show()

Using matplotlib backend: MacOSX
onpick:  [4]


  concatenate([asarray(t.exterior)[:, :2]] +
  [asarray(r)[:, :2] for r in t.interiors])
  polygon = [Polygon(p) for p in polygon]


onpick:  [4]
