In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to the project modules imported below are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import Utils 
from Constants import Const
from SpatialPreprocessing import *
import Models
import Cluster
import Formatting

In [3]:
# Load the denoised per-organ patient dictionary from the path held in Const.
# load_pdict is not defined in this notebook — presumably it comes from the
# `from SpatialPreprocessing import *` line; verify.
pdata = load_pdict(Const.denoised_organ_json)
# Display the top-level keys of the loaded dictionary.
pdata.keys()

dict_keys(['organs', 'patient_ids', 'patients', 'mask'])

In [4]:
def denoised_pdict_to_array(dpd,key,organ_list=None,pids=None):
    """Collect one field of every (patient, organ) entry into a numpy array.

    Parameters
    ----------
    dpd : dict
        Denoised patient dictionary. ``dpd['patients']`` must map a patient id
        to a mapping of organ name -> mapping of fields. ``dpd['organs']`` and
        ``dpd['patient_ids']`` are only read when ``organ_list`` / ``pids`` are
        omitted.
    key : hashable
        Field looked up in each organ entry (this notebook uses ``'volume'``
        and ``'distances'``).
    organ_list : iterable of organ names, a single organ name, or None
        Organs to extract, in column order. ``None`` selects ``dpd['organs']``.
        A single name (including a plain string) is treated as a one-element
        list.
    pids : iterable of patient ids or None
        Patients to extract, in row order. ``None`` selects
        ``dpd['patient_ids']``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from one row per patient and one entry per organ.
        An organ that is missing for a patient -- or a patient id that is not
        in ``dpd['patients']`` at all -- contributes ``0``.
    """
    patients = dpd['patients']
    if organ_list is None:
        organ_list = dpd['organs']
    elif isinstance(organ_list, str) or not Utils.iterable(organ_list):
        # A bare organ name is itself iterable in Python; wrap it explicitly so
        # it is never walked character by character.
        organ_list = [organ_list]
    # Materialise once so a one-shot iterable is not exhausted by the first patient.
    organ_list = list(organ_list)
    if pids is None:
        pids = dpd['patient_ids']

    rows = []
    for pid in pids:
        pentry = patients.get(pid)
        if pentry is None:
            # Unknown patient id: behave as if every organ were missing
            # instead of failing on None.get(...).
            pentry = {}
        row = []
        for organ in organ_list:
            odata = pentry.get(organ)
            # NOTE(review): the filler is a scalar 0 even when `key` holds a
            # vector (e.g. 'distances'), which would make the rows ragged --
            # confirm the denoised data never lacks an organ for such keys.
            row.append(0 if odata is None else odata.get(key))
        rows.append(row)
    return np.array(rows)

# Build the per-patient, per-organ arrays that feed the similarity model below:
# one from each organ's 'volume' field and one from its 'distances' field.
volumes = denoised_pdict_to_array(pdata,'volume')
distances = denoised_pdict_to_array(pdata,'distances')
# Display the 'distances' values restricted to the single organ 'gtv_composite'
# (result is shown, not stored).
denoised_pdict_to_array(pdata,'distances',organ_list =['gtv_composite'])

array([[[ 52.80154038,  30.78206936,  36.54036518, ...,  69.09887051,
          76.45932136,  34.49132434]],

       [[ 56.17275397,  42.60348368,  65.54322554, ...,  90.91117907,
         107.40697545,  61.01381192]],

       [[ 54.54629573,  25.8748004 ,  31.78786091, ...,  64.26695194,
          71.94178839,  36.46034601]],

       ...,

       [[ 35.75664804,  26.21184851,  12.87927653, ...,  66.42124281,
          71.32256685,  32.07368325]],

       [[ 62.69004023,  26.94093477,  30.44478227, ...,  72.60637527,
          78.12025671,  37.29541492]],

       [[ 32.72017839,  36.54834105,  49.79315462, ...,  89.0044678 ,
          94.9572382 ,  36.07136741]]])

In [6]:
# Compute the pairwise patient similarity matrix from the distance and volume arrays.
# NOTE(review): the earlier comment described the input as a dictionary of arrays
# (as in the DataInputer output), but this cell passes two arrays — confirm
# against Models.TssimSimilarity.
# The displayed result has values in the 0-1 range with ones on the diagonal.
# n_jobs > 1 reportedly makes the model try to run in parallel — TODO confirm.
sim = Models.TssimSimilarity(n_jobs = 4)
sim_matrix = sim.get_similarity_matrix(distances,volumes)
sim_matrix

comparing patient 219 and patient 220: 0.70397209166129024

array([[1.        , 0.65138312, 0.73999466, ..., 0.70809249, 0.73202861,
        0.68980513],
       [0.65138312, 1.        , 0.73460779, ..., 0.72418887, 0.76562086,
        0.72969917],
       [0.73999466, 0.73460779, 1.        , ..., 0.683643  , 0.63849007,
        0.58039612],
       ...,
       [0.70809249, 0.72418887, 0.683643  , ..., 1.        , 0.70285588,
        0.65142515],
       [0.73202861, 0.76562086, 0.63849007, ..., 0.70285588, 1.        ,
        0.76731006],
       [0.68980513, 0.72969917, 0.58039612, ..., 0.65142515, 0.76731006,
        1.        ]])

In [7]:
# Assign every patient a cluster label from the similarity matrix.
# The argument 4 is presumably the number of clusters (the displayed labels
# run from 1 to 4) — confirm in Cluster.SimilarityClusterer.
clusters = Cluster.SimilarityClusterer(4).fit_predict(sim_matrix)
clusters

array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 2,
       3, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 1, 3, 1, 2, 2, 1, 2, 2,
       3, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 2, 3, 3, 3, 3,
       3, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3,
       2, 3, 2, 3, 3, 3, 3, 3, 2, 3, 1, 2, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 4, 3, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4,
       4], dtype=int32)

In [8]:
# Takes the similarity matrix and returns, for each patient, an array with the
# indices of its matched neighbours.
# NOTE(review): the matches are described as ordered nearest-first — confirm
# in Models.PatientKNN.
knn = Models.PatientKNN(match_type = 'default')
knn_matches = knn.get_matches(sim_matrix)
knn_matches

[array([ 22,  92,   9,  19, 166]),
 array([ 92,  22,  19, 182,   9]),
 array([ 22, 104,  23, 211, 171]),
 array([ 22,  92,   9,  19, 166]),
 array([ 22,   9,  92, 104, 166]),
 array([ 22, 104,  23, 211, 107]),
 array([ 22,  92,   9, 166, 104]),
 array([ 22,  92,   9,  19, 166]),
 array([ 22,   7,   9,  92, 104]),
 array([ 7,  4,  8, 22,  6]),
 array([22,  7,  4, 92, 19]),
 array([ 22,  92,  19, 166, 104]),
 array([ 22, 104, 166,  92,  33]),
 array([ 22,  92,  19, 166, 104]),
 array([ 22, 104, 166,  92,  33]),
 array([ 22, 104, 166,  23,  33]),
 array([ 7, 22,  4, 12, 11]),
 array([ 22, 104,   7,  23, 166]),
 array([ 7, 22,  4, 12, 11]),
 array([22,  7,  4, 12, 14]),
 array([22,  7,  4, 12, 92]),
 array([ 22, 104, 166,  23,  33]),
 array([ 4,  7, 15, 14,  8]),
 array([ 7, 15,  4, 12, 14]),
 array([ 7,  4, 15, 12, 14]),
 array([ 92, 166, 104,  33, 206]),
 array([  7, 104,   4, 166, 211]),
 array([ 92, 182, 122, 135,  94]),
 array([ 92,   7, 166, 104,  33]),
 array([ 7,  4, 25, 92, 11]),


In [9]:
def predict_patients(dpd,sim = None,
                    n_clusters = 4,
                    n_match_bounds = [5,20],
                    match_threshold = .95
                   ):
    """Build a per-patient summary of cluster label and nearest matches.

    Parameters
    ----------
    dpd : dict
        Denoised patient dictionary; ``dpd['patient_ids']`` gives the patient
        order, which must match the row/column order of ``sim``.
    sim : square array-like or None
        Pairwise patient similarity matrix. When ``None`` it is computed from
        the 'volume' and 'distances' fields of ``dpd`` with
        ``Models.TssimSimilarity(n_jobs = 4)``.
    n_clusters : int
        Passed to ``Cluster.SimilarityClusterer``.
    n_match_bounds : sequence
        Passed through unchanged to ``Models.PatientKNN`` (never mutated here).
    match_threshold : float
        Passed through unchanged to ``Models.PatientKNN``.

    Returns
    -------
    dict
        Maps each patient id to a dict with keys ``'neighbors'`` (ids of the
        matched patients), ``'cluster'`` (that patient's cluster label) and
        ``'similarity'`` (the similarity values to those neighbors, in the
        same order as ``'neighbors'``).
    """
    if sim is None:
        # Derive the inputs from the dictionary that was passed in, not from a
        # notebook-level variable, so the function works for any `dpd`.
        volumes = denoised_pdict_to_array(dpd,'volume')
        distances = denoised_pdict_to_array(dpd,'distances')
        sim = Models.TssimSimilarity(n_jobs = 4).get_similarity_matrix(distances,volumes)
    pids = dpd['patient_ids']

    clusterer = Cluster.SimilarityClusterer(n_clusters)
    clusters = clusterer.fit_predict(sim)

    knn = Models.PatientKNN(match_type = 'default',
                          n_match_bounds = n_match_bounds,
                          match_threshold = match_threshold)
    # Match on the same matrix that was clustered (the `sim` argument), never on
    # whatever `sim_matrix` happens to exist in the kernel.
    knn_matches = knn.get_matches(sim)

    # Array view used only for the fancy indexing below; also lets a nested
    # list be passed as `sim`.
    sim_values = np.asarray(sim)
    sim_dict = {}
    for i,pid in enumerate(pids):
        matches = knn_matches[i]
        sim_dict[pid] = {
            'neighbors': [pids[ii] for ii in matches],
            'cluster': clusters[i],
            'similarity': sim_values[i,matches],
        }
    return sim_dict

# Build the per-patient results, passing in the already-computed sim_matrix so
# the similarity step inside predict_patients is skipped.
pp = predict_patients(pdata,sim_matrix)
pp

{'10': {'neighbors': ['10071',
   '156',
   '10021',
   '10063',
   '265',
   '177',
   '10108',
   '10077',
   '5056',
   '5058',
   '10143',
   '256'],
  'cluster': 1,
  'similarity': array([0.82646524, 0.7867973 , 0.77720793, 0.77248095, 0.76139851,
         0.75459298, 0.75239693, 0.75175813, 0.75075927, 0.75065103,
         0.74986026, 0.7496496 ])},
 '100': {'neighbors': ['156',
   '10071',
   '10063',
   '34',
   '10021',
   '10014',
   '212',
   '5042',
   '2016',
   '2011',
   '172',
   '155'],
  'cluster': 2,
  'similarity': array([0.80279578, 0.79343738, 0.79189224, 0.78282787, 0.78214166,
         0.77545814, 0.77495828, 0.77318009, 0.77223341, 0.77196678,
         0.77188441, 0.77143309])},
 '10009': {'neighbors': ['10071',
   '177',
   '10075',
   '5075',
   '276',
   '180',
   '173',
   '262',
   '154',
   '10108',
   '265',
   '10021'],
  'cluster': 1,
  'similarity': array([0.93614262, 0.8129744 , 0.80527116, 0.80103066, 0.79335082,
         0.79201686, 0.78894665, 0.7

In [10]:
# Write the results, nested under a 'patients' key, to the JSON path held in Const.
# NOTE(review): the meaning of the third positional argument (True) is not
# visible here — check Utils.np_dict_to_json.
Utils.np_dict_to_json({'patients':pp}, Const.organ_similarity_results_json,True)

True