In [11]:
# Automatically reload imported modules before each cell runs, so edits to local .py files are picked up
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import Utils 
from Constants import Const
from SpatialPreprocessing import *
import Models
import Cluster
import Formatting

In [13]:
# Load the denoised organ data from the location named by Const.denoised_organ_json.
# NOTE(review): load_pdict is not imported by name; presumably it comes from the
# `from SpatialPreprocessing import *` star import -- confirm.
pdata = load_pdict(Const.denoised_organ_json)
# Trailing expression: display the top-level keys of the loaded dict.
pdata.keys()

dict_keys(['organs', 'patient_ids', 'patients', 'mask'])

In [15]:
def denoised_pdict_to_array(dpd,key,organ_list=None,pids=None):
    patients = dpd['patients']
    if organ_list is None:
        organ_list = dpd['organs']
    elif not Utils.iterable(organ_list):
        organ_list = [organ_list]
    if pids is None:
        pids = dpd['patient_ids']
    array = []
    for pid in pids:
        row = []
        pentry = patients.get(pid)
        for organ in organ_list:
            odata = pentry.get(organ)
            if odata is None:
                row.append(0)
            else:
                oval = odata.get(key)
                row.append(oval)
        array.append(row)
    return np.array(array)

# Per-patient, per-organ arrays of the 'volume' and 'distances' entries
# (organ_list and pids are left at their defaults).
volumes = denoised_pdict_to_array(pdata,'volume')
distances = denoised_pdict_to_array(pdata,'distances')
# Display only (result not stored): 'distances' restricted to the single organ 'gtv_composite'.
denoised_pdict_to_array(pdata,'distances',organ_list =['gtv_composite'])

array([[[ 56.81325993,  36.00990893,  38.80793388, ...,  67.72743251,
          75.11925919,  36.88409443]],

       [[ 57.88409075,  34.71454093,  46.24173921, ...,  91.74994817,
         113.68383478,  48.53422978]],

       [[ 59.91522982,  28.70163846,  53.32288273, ...,  64.1328949 ,
          69.95318288,  38.89681166]],

       ...,

       [[ 60.125     ,  24.3125    ,  32.53125   , ...,  31.859375  ,
          45.65625   ,  27.28125   ]],

       [[ 62.53125   ,  25.875     ,  27.28125   , ...,  73.5625    ,
          78.5       ,  36.09375   ]],

       [[ 27.34375   ,  36.8125    ,  51.84375   , ...,  88.875     ,
          94.5       ,  35.90625   ]]])

In [16]:
# Compute the pairwise patient similarity matrix from the distance and volume arrays.
# The original note describes the input as a dictionary of arrays (as in the
# DataInputer output); plain arrays are passed here -- TODO confirm which form is expected.
# Per the original note, the result is a similarity matrix with values 0-1,
# and n_jobs > 1 will try to multithread.
sim = Models.TssimSimilarity(n_jobs = 4)
sim_matrix = sim.get_similarity_matrix(distances,volumes)
sim_matrix

comparing patient 219 and patient 220: 0.70617113044710435

array([[1.        , 0.65394267, 0.76266821, ..., 0.73139217, 0.74209809,
        0.69892078],
       [0.65394267, 1.        , 0.76223605, ..., 0.75814083, 0.78446825,
        0.74722115],
       [0.76266821, 0.76223605, 1.        , ..., 0.68211828, 0.65788359,
        0.60410837],
       ...,
       [0.73139217, 0.75814083, 0.68211828, ..., 1.        , 0.71237677,
        0.66519347],
       [0.74209809, 0.78446825, 0.65788359, ..., 0.71237677, 1.        ,
        0.77107116],
       [0.69892078, 0.74722115, 0.60410837, ..., 0.66519347, 0.77107116,
        1.        ]])

In [17]:
# Assign each patient a cluster label from the similarity matrix.
# The 4 is presumably the number of clusters (predict_patients passes
# n_clusters in the same position) -- confirm against Cluster.SimilarityClusterer.
clusters = Cluster.SimilarityClusterer(4).fit_predict(sim_matrix)
clusters

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2,
       2, 3, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 3, 2, 3, 3, 2, 2,
       3, 2, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3,
       2, 3, 2, 3, 3, 4, 3, 3, 3, 3, 1, 3, 3, 3, 3, 4, 4, 3, 3, 3, 4, 4,
       3, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 2, 4, 4, 4, 3,
       3, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4,
       4], dtype=int32)

In [18]:
# PatientKNN.get_matches takes a similarity matrix and returns a list with one
# entry per row: an array of the indices of that row's nearest neighbors, in order
# (presumably most similar first -- confirm against Models.PatientKNN).
knn = Models.PatientKNN(match_type = 'default')
knn_matches = knn.get_matches(sim_matrix)
knn_matches

[array([  9,  92, 166,  19,  24]),
 array([182,  20,   9,  99, 160]),
 array([ 22, 104, 211,  23,   9]),
 array([ 9, 92, 20, 19, 99]),
 array([ 22,   9,  92, 166,  24]),
 array([ 22,   4, 104, 211,  23]),
 array([ 4,  3, 22,  1,  9]),
 array([  9,  92,  22, 166,  19]),
 array([ 7,  4, 22,  3,  1]),
 array([ 7,  4,  3, 22,  5]),
 array([7, 4, 3, 1, 0]),
 array([ 92, 166,  24,  19,  33]),
 array([ 22, 166,  92,  24,  33]),
 array([ 92, 166,  24,  19,   7]),
 array([ 22, 166,  92,  24,  33]),
 array([ 22,   7,  12, 166,  92]),
 array([ 7, 12,  4, 14, 11]),
 array([ 22,  12, 211, 166, 104]),
 array([ 7, 12,  4, 14, 11]),
 array([12,  7, 22,  4, 14]),
 array([ 7, 12,  4, 14, 22]),
 array([ 22, 166,  92,  24, 211]),
 array([17,  5,  2, 14, 12]),
 array([ 12, 104, 164,  17,  14]),
 array([12,  7, 14,  4, 21]),
 array([ 92, 166,  33, 109, 205]),
 array([ 12,   7, 211, 104, 166]),
 array([182,  99, 160,  45, 122]),
 array([ 92,   7, 166,  33, 109]),
 array([ 7, 25, 12,  4, 14]),
 array([ 92, 16

In [19]:
def predict_patients(dpd,sim = None, 
                    n_clusters = 4, 
                    n_match_bounds = [5,20], 
                    match_threshold = .95
                   ):
    """Cluster patients and find each patient's nearest neighbors.

    Parameters
    ----------
    dpd : dict
        Patient dictionary. 'patient_ids' is always read; when `sim` is None
        the 'volume' and 'distances' arrays are also extracted from it with
        denoised_pdict_to_array.
    sim : array or None
        Precomputed patient-by-patient similarity matrix, with rows in the
        same order as dpd['patient_ids']. When None it is computed with
        Models.TssimSimilarity from `dpd`.
    n_clusters : int
        Passed to Cluster.SimilarityClusterer.
    n_match_bounds : list
        Passed unchanged to Models.PatientKNN (never modified here).
    match_threshold : float
        Passed unchanged to Models.PatientKNN.

    Returns
    -------
    dict
        Maps each patient id to a dict with
        'neighbors'  -- ids of the matched patients, in the order returned by
                        PatientKNN.get_matches,
        'cluster'    -- that patient's cluster label,
        'similarity' -- the similarity values for those neighbors.
    """
    if sim is None:
        # Use the dict that was passed in, not whatever `pdata` happens to be
        # in the notebook's global namespace.
        volumes = denoised_pdict_to_array(dpd,'volume')
        distances = denoised_pdict_to_array(dpd,'distances')
        sim = Models.TssimSimilarity(n_jobs = 4).get_similarity_matrix(distances,volumes)
    pids = dpd['patient_ids']

    clusterer = Cluster.SimilarityClusterer(n_clusters)
    clusters = clusterer.fit_predict(sim)

    knn = Models.PatientKNN(match_type = 'default',
                          n_match_bounds = n_match_bounds,
                          match_threshold = match_threshold)
    # Match on the same matrix used for clustering and for the reported
    # similarities; previously this read the global `sim_matrix`.
    knn_matches = knn.get_matches(sim)

    sim_dict = {}
    for i,pid in enumerate(pids):
        matches = knn_matches[i]
        sim_dict[pid] = {
            'neighbors': [pids[ii] for ii in matches],
            'cluster': clusters[i],
            'similarity': sim[i,matches],
        }
    return sim_dict

# Build the per-patient neighbor/cluster/similarity dict, passing in the
# similarity matrix computed earlier so it is not recomputed.
pp = predict_patients(pdata,sim_matrix)
# Trailing expression: displays the whole result dict.
pp

{'10': {'neighbors': ['10021',
   '156',
   '265',
   '10063',
   '10077',
   '10065',
   '10108',
   '183',
   '5056',
   '10143',
   '5058',
   '256'],
  'cluster': 1,
  'similarity': array([0.79261372, 0.7848294 , 0.77923684, 0.77754466, 0.77635616,
         0.77122628, 0.76912533, 0.76877474, 0.76829992, 0.76758545,
         0.76660976, 0.76629136])},
 '100': {'neighbors': ['34',
   '10065',
   '10021',
   '172',
   '256',
   '10138',
   '10014',
   '156',
   '10063',
   '234',
   '212',
   '2016'],
  'cluster': 1,
  'similarity': array([0.80297476, 0.8000019 , 0.79904792, 0.7989971 , 0.79592256,
         0.79552845, 0.79530444, 0.79342736, 0.79266215, 0.79204661,
         0.79204125, 0.79030746])},
 '10009': {'neighbors': ['10071',
   '177',
   '5075',
   '10075',
   '10021',
   '265',
   '276',
   '262',
   '173',
   '5058',
   '10108',
   '10077'],
  'cluster': 1,
  'similarity': array([0.92752487, 0.80088803, 0.80059499, 0.79932913, 0.79175436,
         0.79063653, 0.78807246, 

In [21]:
# Export the results, wrapped under a 'patients' key, to the target named by
# Const.organ_similarity_results_json (presumably a JSON file path -- confirm).
# NOTE(review): the meaning of the trailing True flag is not visible here; check Utils.np_dict_to_json.
Utils.np_dict_to_json({'patients':pp}, Const.organ_similarity_results_json,True)

True