In [2]:
#seed numpy's global random generator before anything else runs,
#so code that draws from numpy.random starts from the same state on every run
from numpy.random import seed
seed(1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from analysis import *
from collections import namedtuple
import Metrics
from PatientSet import PatientSet
from Constants import Constants
from dependencies.Boruta import BorutaPy
from Clustering import *
import re

#sklearn dependencies
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, adjusted_rand_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import resample
from sklearn.cluster import FeatureAgglomeration

#the external libraries emit a very large number of DeprecationWarnings,
#so ignore that warning category to keep the notebook output readable
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

Using TensorFlow backend.


In [3]:
#build the PatientSet object holding all of the patient info
db = PatientSet()

#attach the extra candidate features directly onto the object, so that
#db.to_dataframe can later hand them all back in one dataframe
#(one-hot encoding and labels are handled there automatically)

#total gtv volume per patient, stored as a single column
db.t_volumes = np.array(
    [np.sum([gtv.volume for gtv in patient_gtvs]) for patient_gtvs in db.gtvs]
).reshape(-1, 1)
#output of the tsim_prediction helper for this patient set
db.tsimdoses = tsim_prediction(db)
#flag for laterality code 'B'
db.bilateral = (db.lateralities == 'B')
#row-wise sum of the volumes array
db.total_volumes = db.volumes.sum(axis = 1)
#outcome flags derived from feeding tube + aspiration
#NOTE(review): the '> 1' test assumes both fields are numeric; if they are boolean
#arrays the sum cannot exceed 1 - confirm against PatientSet
db.toxicity = (db.feeding_tubes + db.aspiration) > 0
db.xerostima = (db.feeding_tubes + db.aspiration) > 1

  tumor_position /= tumor_volume
  except:


error reading tumor volume for  10091
error reading tumor volume for  10148
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.

patient data loaded...



In [96]:
#---- parameters for the experiments ----

#names of the patientset fields to use as the outcome, one experiment per entry
toxicities_to_test = ['toxicity']

#candidate features for the feature selection. each should be a field in the patientset
#these ones are never clustered
unclusterable_features = ['t_volumes', 'bilateral', 'total_volumes']
#these ones are clustered (each individually) when feature_clustering is set
clusterable_features = ['tumor_distances', 'volumes']

#features used specifically for feature selection vs the ones actually used afterwards.
#each should be some combination of actual and predicted dose
train_features = ['tsimdoses']
test_features = ['tsimdoses']

#how many times to resample while doing the feature selection
#with n = 1, just the first result is used
n_samples = 1

#rescaling applied to the train/test frames; set df_rescale = None to skip it
def df_rescale(x):
    """Standardize x along axis 0: subtract the mean and divide by the standard deviation."""
    return (x - x.mean(axis = 0))/x.std(axis = 0)

#lower bound on the number of features to subset
min_features = 2

#number of clusters to use; for now it is pinned to a single cluster count
n_clusters = 4

#leave this as None if the features should not be clustered,
#otherwise give the number of clusters to group them into.
#should be <33 as of writing this, as that is the total number of included organs
feature_clustering = None

#prefix for where results get saved; use None to skip saving
save_root = 'data/clustering_results/'

#criterion used to decide which clustering method is best.
#should be 'rand_score' or 'correlation'
cluster_metric = 'correlation'

In [101]:
%load_ext autoreload
%autoreload 2
from Clustering import *
#our actual experiment, try to find correlations using the boruta method and such
feature_list = []
for tox_name in toxicities_to_test:
    print(tox_name)
    toxicity = getattr(db, tox_name) > 0
    
    #remove eyeball stuff from the candidates since those are missing in some patients
    #and it messes up the feature selection due to that noise
    organs = copy(Constants.organ_list)
    for o in Constants.organ_list:
        if re.search('Eyeball', o) is not None:
            organs.remove(o)
    
    train, test = get_train_test_datasets(db, unclusterable_features, 
                                          clusterable_features, 
                                          train_features, 
                                          test_features,
                                         organs,
                                         feature_clustering)
    
    if df_rescale is not None:
        train = df_rescale(train)
        test = df_rescale(test)
    selection_clusterer  = FClusterer(n_clusters)
    feature_selector = FeatureClusterSelector(
        n_samples = n_samples,
        clusterer = selection_clusterer).fit(train, toxicity)
    to_use = feature_selector.transform(test)
    labels = feature_selector.predict_labels(train, toxicity)
    print()
    print()
    print(get_contingency_table(labels, toxicity))
    print(fisher_exact_test(labels, toxicity))
    print('number of features: ', to_use.shape[1])

    #we're going to try a bunch of different clusterings and look at the best result
    clustering = get_optimal_clustering(to_use.values, 
                                        toxicity,
                                        metric = cluster_metric,
                                       min_clusters = n_clusters,
                                       max_clusters = n_clusters)
    print(clustering[1].method)
    print(get_contingency_table(clustering[0], toxicity))
    print('correlation: ', clustering[1].correlation)
    print('rand score: ', clustering[1].rand_score,'\n')

    to_use['cluster_labels'] = clustering[0]
#     print(best_features.columns)
    to_use.index.rename('Dummy.ID', inplace = True)
    feature_list.append(to_use)
    if save_root is not None:
        n_best_clusters = len(set(clustering[0]))
        to_use.to_csv(save_root
                     + 'boruta_features_k='
                     + str(n_best_clusters)
                     + '_p=' + '{:.3e}'.format(clustering[1].correlation)
                     + '_toxicity=' + tox_name + '.csv')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
toxicity
       Soft_Palate_tsimdoses  bilateral
3                   0.231651  -0.313698
4                  -1.239182  -0.313698
10                 -0.414131  -0.313698
11                 -0.781298  -0.313698
27                 -0.315371  -0.313698
29                 -1.380026  -0.313698
31                  0.670126  -0.313698
33                  0.867378   3.171838
34                  0.905737  -0.313698
35                  0.809784  -0.313698
36                  0.353946  -0.313698
37                  0.532103  -0.313698
41                 -1.035127  -0.313698
46                 -0.375295   3.171838
49                 -0.614599  -0.313698
100                 0.849090  -0.313698
102                -1.021740  -0.313698
104                -0.863371  -0.313698
105                 1.101063  -0.313698
109                -1.351970  -0.313698
110                 0.623405   3.171838
112                 0.4

['Soft_Palate_tsimdoses', 'bilateral', 'Hyoid_bone_tumor_distances']


[[58.  3.]
 [98. 23.]
 [ 4.  2.]
 [ 6.  6.]]
0.0004990744539961271
number of features:  3
Kmeans4
[[91.  5.]
 [29. 20.]
 [10.  8.]
 [36.  1.]]
correlation:  1.572458184797956e-09
rand score:  0.1240217399244526 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [93]:
%load_ext autoreload
%autoreload 2
from Clustering import *

#get all the features found before and put them together
combined_df = feature_list[0]
if len(feature_list) > 1:
    for i in range(1, len(feature_list)):
        df2 = feature_list[i]
        to_drop = list(set(combined_df.columns).intersection(set(df2.columns)))
        if len(to_drop) == df2.shape[1]:
            continue
        df2 = df2.drop(to_drop, axis = 1)
        combined_df = pd.merge(combined_df, df2, on = 'Dummy.ID')
combined_df.drop('cluster_labels', axis = 1, inplace = True)
print(combined_df.columns)
combined_clusters = get_optimal_clustering(combined_df.values, db.toxicity,
                                    metric = cluster_metric,
                                    min_clusters = 2,
                                    max_clusters = 2)
print(combined_clusters[1].method)
print(get_contingency_table(combined_clusters[0], toxicity))
print('correlation: ', combined_clusters[1].correlation)
print('rand score: ', combined_clusters[1].rand_score, '\n')
combined_df['cluster_labels'] = combined_clusters[0]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Index(['Masseter_M_tsimdoses_combined', 'Larynx_tumor_distances',
       'Soft_Palate_volumes', 'IPC_volumes'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Kmeans4
[[23. 24.]
 [35.  4.]
 [33.  3.]
 [75.  3.]]
correlation:  4.017424240352255e-10
rand score:  0.08930213994038591 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
#write the combined feature/cluster table to disk unless saving is switched off
if save_root is not None:
    combined_df.to_csv("".join([save_root, 'forwardSelectionClustering2.csv']))

In [99]:
#re-cluster the patients on the selected features only. to_use carries a
#'cluster_labels' column once the experiment cell has run, so drop it here to keep
#earlier cluster assignments from being used as an input feature
#(errors = 'ignore' keeps this working if the column is not there yet)
labels = FClusterer(n_clusters).fit_predict(
    to_use.drop('cluster_labels', axis = 1, errors = 'ignore').values)
print(get_contingency_table(labels.ravel(), toxicity))

[[10.  8.]
 [36.  1.]
 [29. 20.]
 [91.  5.]]


In [87]:
#preview the selected features and their cluster labels (first rows only,
#rather than dumping the whole frame into the notebook)
to_use.head(10)

Unnamed: 0_level_0,Soft_Palate_tsimdoses,bilateral,Hyoid_bone_tumor_distances,cluster_labels
Dummy.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,0.231651,-0.313698,1.626802,2.0
4,-1.239182,-0.313698,0.125167,3.0
10,-0.414131,-0.313698,-1.518801,3.0
11,-0.781298,-0.313698,-0.618564,3.0
27,-0.315371,-0.313698,-0.618564,3.0
29,-1.380026,-0.313698,0.145952,3.0
31,0.670126,-0.313698,1.352461,2.0
33,0.867378,3.171838,0.748059,5.0
34,0.905737,-0.313698,1.966884,2.0
35,0.809784,-0.313698,0.938668,2.0
