In [1]:
# Automatically reload imported modules before each cell runs, so edits to the project's .py files are picked up
%load_ext autoreload
%autoreload 2

In [75]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import Utils 
from Constants import Const
from SpatialPreprocessing import *
import Models
import Cluster
import Formatting

In [41]:
# Load the patient dictionary from the path stored in Const.denoised_organ_json.
# NOTE(review): load_pdict is presumably provided by the star import of
# SpatialPreprocessing -- confirm.
pdata = load_pdict(Const.denoised_organ_json)
# Display the top-level keys of the loaded dictionary.
pdata.keys()

dict_keys(['organs', 'patient_ids', 'patients', 'mask', 'gtvs'])

In [42]:
def denoised_pdict_to_array(dpd,key,organ_list=None,pids=None):
    """Collect one field from every (patient, organ) pair into a numpy array.

    Parameters
    ----------
    dpd : dict
        Patient dictionary with at least a 'patients' entry mapping
        patient id -> {organ name -> {field name -> value}}. The 'organs'
        and 'patient_ids' entries are read when organ_list / pids are None.
    key : str
        Field to extract from each organ entry (e.g. 'volume', 'distances').
    organ_list : iterable or single organ name, optional
        Organs to extract, in column order. Defaults to dpd['organs'].
        A value that Utils.iterable reports as non-iterable is wrapped in
        a one-element list.
    pids : iterable, optional
        Patient ids to extract, in row order. Defaults to dpd['patient_ids'].

    Returns
    -------
    numpy.ndarray
        Array whose first axis follows pids and second axis follows
        organ_list. Entries with no data (organ missing for the patient, or
        key missing for the organ) are zero-filled: a scalar 0 for scalar
        fields, or an all-zero array with the same shape as the first value
        found for array-valued fields, so the result stays rectangular.

    Raises
    ------
    KeyError
        If a requested patient id is not present in dpd['patients'].
    """
    patients = dpd['patients']
    if organ_list is None:
        organ_list = dpd['organs']
    elif not Utils.iterable(organ_list):
        organ_list = [organ_list]
    if pids is None:
        pids = dpd['patient_ids']

    rows = []
    template = None  # first real value seen; decides the shape of the zero fill
    for pid in pids:
        pentry = patients.get(pid)
        if pentry is None:
            # Fail with the offending id instead of an AttributeError on None.
            raise KeyError('patient id {!r} not found in patient dictionary'.format(pid))
        row = []
        for organ in organ_list:
            odata = pentry.get(organ)
            # None marks "no data" for both a missing organ and a missing key.
            oval = None if odata is None else odata.get(key)
            if template is None and oval is not None:
                template = oval
            row.append(oval)
        rows.append(row)

    # Scalar fields keep the plain 0 fill; array-valued fields get zeros of the
    # matching shape so np.array does not receive a ragged nested list.
    fill = 0
    if template is not None and np.ndim(template) > 0:
        fill = np.zeros_like(np.asarray(template))

    return np.array([[fill if val is None else val for val in row] for row in rows])

# Build the per-patient, per-organ arrays for the 'volume' and 'distances'
# fields, using the default organ list and patient ids stored in pdata.
volumes = denoised_pdict_to_array(pdata,'volume')
distances = denoised_pdict_to_array(pdata,'distances')
# Preview the 'distances' values for the 'gtv_composite' entry only; the result
# is displayed as the cell output and not stored.
denoised_pdict_to_array(pdata,'distances',organ_list =['gtv_composite'])

array([[[ 53.990244  ,  26.9566633 ,  34.46537775, ...,  47.16324716,
          52.3086767 ,   0.        ]],

       [[ 53.87450849,  38.18709173,  55.89453126, ...,  94.88876553,
         110.10518253,   0.        ]],

       [[ 70.1337866 ,  26.72505832,  50.61214461, ...,  55.87763649,
          61.78852572,   0.        ]],

       ...,

       [[ 66.63556846,  25.38946272,  34.80015269, ...,  31.19893396,
          42.24001371,   0.        ]],

       [[ 56.88218   ,  23.32631631,  27.03749036, ...,  56.47832656,
          54.53081401,   0.        ]],

       [[ 47.0634996 ,  21.54338292,  21.29420479, ...,  66.00995932,
          81.18200813,   0.        ]]])

In [61]:
# Build the patient-to-patient similarity matrix from the distances and volumes arrays.
# NOTE(review): the original note described the input as a dictionary of arrays
# (as in the DataInputer output); two arrays are passed here -- confirm the
# expected input type against Models.TssimSimilarity.
# Returns a similarity matrix with values 0-1.
# n_jobs > 1 will try to multithread.
sim = Models.TssimSimilarity(n_jobs = 4)
sim_matrix = sim.get_similarity_matrix(distances,volumes)
sim_matrix

comparing patient 219 and patient 220: 0.72823979757277917

array([[1.        , 0.60346752, 0.68129206, ..., 0.66340482, 0.67508076,
        0.63539123],
       [0.60346752, 1.        , 0.70921245, ..., 0.72010532, 0.74720987,
        0.71132594],
       [0.68129206, 0.70921245, 1.        , ..., 0.6228462 , 0.58879462,
        0.53641564],
       ...,
       [0.66340482, 0.72010532, 0.6228462 , ..., 1.        , 0.67753634,
        0.63136291],
       [0.67508076, 0.74720987, 0.58879462, ..., 0.67753634, 1.        ,
        0.7282398 ],
       [0.63539123, 0.71132594, 0.53641564, ..., 0.63136291, 0.7282398 ,
        1.        ]])

In [91]:
# Assign a cluster label to every patient from the similarity matrix.
# NOTE(review): the argument 4 is presumably the number of clusters
# (predict_patients passes n_clusters in the same position) -- confirm.
clusters = Cluster.SimilarityClusterer(4).fit_predict(sim_matrix)
# Display the label array.
clusters

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2,
       2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2,
       2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2,
       2, 3, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 3, 2, 2, 2,
       3, 2, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3,
       2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4,
       4], dtype=int32)

In [80]:
# get_matches takes a similarity matrix and returns a list with one entry per
# patient: the indices of that patient's nearest neighbors, in order.
knn = Models.PatientKNN(match_type = 'default')
knn_matches = knn.get_matches(sim_matrix)
# Display the neighbor index arrays.
knn_matches

[array([22,  8, 19,  9, 92]),
 array([ 22,   8,  19,  92, 109]),
 array([ 22, 164, 104, 211, 171]),
 array([ 22, 171,   9, 206, 166]),
 array([ 22,   8,   9,  19, 171]),
 array([ 22, 164, 104, 171, 211]),
 array([ 22,   8,   9,   4, 171]),
 array([ 22,   9, 171,  19,  92]),
 array([22,  7,  4,  1,  6]),
 array([ 7,  4, 22,  3,  6]),
 array([ 22,   7,   4, 171,  19]),
 array([ 22,  19,  92, 171, 109]),
 array([ 22, 171, 206,  92, 166]),
 array([ 22,  19,  92, 171, 109]),
 array([ 22, 171, 206, 166,  92]),
 array([ 22, 171,  12, 206, 166]),
 array([22, 12,  7,  4, 11]),
 array([ 17, 171, 206, 166, 100]),
 array([22, 12,  7,  4, 17]),
 array([22, 12,  7,  4, 17]),
 array([ 22,  12,   7, 171,  92]),
 array([ 22, 171, 206, 166, 100]),
 array([22,  7,  4, 12, 14]),
 array([ 17,  12,   7, 164, 104]),
 array([12, 17,  7,  4, 21]),
 array([171,  92, 206, 109,  33]),
 array([25, 12, 17,  7,  4]),
 array([182, 109, 160, 147, 122]),
 array([ 92, 171,  25, 109, 206]),
 array([ 25,  12,   7, 171,   

In [93]:
def predict_patients(dpd,sim = None,
                    n_clusters = 4,
                    n_match_bounds = [5,20],
                    match_threshold = .95
                   ):
    """Cluster patients and find each patient's nearest neighbors.

    Parameters
    ----------
    dpd : dict
        Patient dictionary; 'patient_ids' is read here, and the dictionary is
        passed to denoised_pdict_to_array when sim has to be computed.
    sim : array, optional
        Precomputed patient-by-patient similarity matrix, with rows/columns in
        the same order as dpd['patient_ids']. When None it is computed from
        the 'volume' and 'distances' fields of dpd with Models.TssimSimilarity.
    n_clusters : int
        Passed to Cluster.SimilarityClusterer.
    n_match_bounds : list
        Passed unchanged to Models.PatientKNN as n_match_bounds.
    match_threshold : float
        Passed unchanged to Models.PatientKNN as match_threshold.

    Returns
    -------
    dict
        Maps each patient id to a dict with:
        'neighbors'  -- patient ids of the matched neighbors,
        'cluster'    -- the cluster label for that patient,
        'similarity' -- the similarity values between the patient and each
                        of its matched neighbors (same order as 'neighbors').
    """
    if sim is None:
        # Use the dictionary that was passed in, not the notebook-level pdata.
        volumes = denoised_pdict_to_array(dpd,'volume')
        distances = denoised_pdict_to_array(dpd,'distances')
        sim = Models.TssimSimilarity(n_jobs = 4).get_similarity_matrix(distances,volumes)
    pids = dpd['patient_ids']

    clusterer = Cluster.SimilarityClusterer(n_clusters)
    clusters = clusterer.fit_predict(sim)

    knn = Models.PatientKNN(match_type = 'default',
                          n_match_bounds = n_match_bounds,
                          match_threshold = match_threshold)
    # Match on the same matrix that is clustered and indexed below, rather than
    # on the notebook-level sim_matrix.
    knn_matches = knn.get_matches(sim)

    sim_dict = {}
    for i,pid in enumerate(pids):
        matches = knn_matches[i]
        sim_dict[pid] = {
            'neighbors': [pids[ii] for ii in matches],
            'cluster': clusters[i],
            'similarity': sim[i,matches],
        }
    return sim_dict

# Run the prediction with the precomputed similarity matrix so it is not
# recomputed inside predict_patients, then display the per-patient result.
pp = predict_patients(pdata,sim_matrix)
pp

{'10': {'neighbors': ['10071',
   '10020',
   '10063',
   '10021',
   '156',
   '276',
   '183',
   '5058',
   '10108',
   '265',
   '10140',
   '256'],
  'cluster': 1,
  'similarity': array([0.81028553, 0.73103125, 0.72596919, 0.71922367, 0.71848363,
         0.71591294, 0.71044396, 0.70500703, 0.70494077, 0.70285459,
         0.702319  , 0.70076151])},
 '100': {'neighbors': ['10071',
   '10020',
   '10063',
   '156',
   '183',
   '256',
   '34',
   '229',
   '10021',
   '10018',
   '2016',
   '2007'],
  'cluster': 1,
  'similarity': array([0.82917492, 0.79839592, 0.78534809, 0.76834055, 0.7676517 ,
         0.76402827, 0.76386305, 0.76097392, 0.75997582, 0.75931892,
         0.7573404 , 0.7564199 ])},
 '10009': {'neighbors': ['10071',
   '262',
   '177',
   '5075',
   '276',
   '180',
   '10021',
   '10075',
   '5058',
   '154',
   '10154',
   '173'],
  'cluster': 1,
  'similarity': array([0.89053561, 0.75088344, 0.74994289, 0.74676606, 0.74353196,
         0.73869533, 0.73573765, 0.

In [90]:
# Pass the predictions, wrapped under a 'patients' key, to Utils.np_dict_to_json
# together with the path in Const.organ_similarity_results_json (presumably this
# writes the JSON results file -- confirm).
# NOTE(review): the meaning of the third positional argument (True) is not
# visible in this notebook -- confirm against Utils.np_dict_to_json.
Utils.np_dict_to_json({'patients':pp}, Const.organ_similarity_results_json,True)

True