In [190]:
from numpy.random import seed
seed(1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from analysis import *
from collections import namedtuple
import Metrics
from PatientSet import PatientSet
from Constants import Constants
from dependencies.Boruta import BorutaPy
from Clustering import *
import re

#sklearn dependencies
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, adjusted_rand_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import resample
from sklearn.cluster import FeatureAgglomeration

#we get like a million deprication errors for some reason with the external libraries
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
#load in the patientset object that has all the patient info
db = PatientSet()

#add a bunch of features to the object that we'll want to try
#so we can use the db.to_dataframe function to get them all in a nice dataframe with one-hot encoding and labels automatically
db.discrete_dists = Metrics.discretize(-db.tumor_distances, n_bins = 15, strategy='uniform')
db.discrete_volumes = Metrics.discretize(db.volumes, n_bins = 15, strategy='uniform')
db.t_volumes = np.array([np.sum([g.volume for g in gtvs]) for gtvs in db.gtvs]).reshape(-1,1)
db.tsimdoses = tsim_prediction(db)
db.toxicity = db.feeding_tubes + db.aspiration > 0
db.xerostima = db.feeding_tubes + db.aspiration > 1

  mean_tumor_distances /= tumor_volume
  tumor_position /= tumor_volume


100 [{0, 1}, {2}]
128 [{0}, {1, 2, 3, 4}]
notation not accounted for in lymph nodes: R3/R4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R3/4
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2-R4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3/4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3
10021 [{0, 1}, {2}]
10074 [{0, 1}, {2}]
error reading tumor volume for  10091
error reading tumor volume for  10148
10191 [{0, 1}, {2}]
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:

In [205]:
%reload_ext Clustering

#parameters for the experiments
toxicities_to_test = ['toxicity']

#features to test the feature selection on.  should be fields in the patientset we have
#we don't cluster on these
db.bilateral = db.lateralities == 'B'
db.total_volumes = db.volumes.sum(axis = 0)
unclusterable_features = ['t_volumes', 'bilateral', 'total_volumes']
#we cluster on these (each individually) if feature_clustering is defined,
clusterable_features = ['tumor_distances', 'volumes']
#number of times to resample and doing feature selection
#if n = 1, just use the first result
n_samples = 1

#type of scaling to use
scaler = MinMaxScaler()

#put some bounds on the features to subset
min_features = 2

#make this none if you don't want to cluster the features
#otherwise give a number of clusters to group them, 
#should be <33 as of writting this as that is the total included number of organs
feature_clustering = None

#class used to subset the data, default is what the original paper suggests, roughly
boruta = BorutaPy(ExtraTreesClassifier(n_estimators = 600), n_estimators = 2000)

#where to save results, put None if you don't want to save
save_root = 'data/clustering_results/'

#how to decide which clustering method is best.  shoud be 'rand_score' or 'correlation'
cluster_metric = 'correlation'

In [300]:
%reload_ext Clustering
from Clustering import *

In [301]:
#our actual experiment, try to find correlations using the boruta method and such
feature_list = []
supports_dict = {}
for tox_name in toxicities_to_test:
    print(tox_name)
    toxicity = getattr(db, tox_name) > 0
    
    #remove eyeball stuff from the candidates since those are missing in some patients
    #and it messes up the feature selection due to that noise
    organs = copy(Constants.organ_list)
    for o in Constants.organ_list:
        if re.search('Eyeball', o) is not None:
            organs.remove(o)
    
    base = db.to_dataframe(unclusterable_features)
    for f in clusterable_features:
        temp_data = db.to_dataframe([f], 
                               merge_mirrored_organs = True, 
                               organ_list = organs)
        if feature_clustering is not None:
            temp_data = FeatureClusterer(feature_clustering).fit_predict(temp_data)
        base = base.join(temp_data, how = 'inner')
    
    train = db.to_dataframe(['doses'],
                           merge_mirrored_organs = True,
                           organ_list = organs)
    test = db.to_dataframe(['tsimdoses'],
                           merge_mirrored_organs = True,
                           organ_list = organs)
    if feature_clustering is not None:
        fc = FeatureClusterer(feature_clustering)
        train = fc.fit_predict(train)
        test = fc.predict(test)
    train = train.join(base, how = 'inner')
    test = test.join(base, how = 'inner')
    feature_selector = FeatureClusterSelector(n_samples = n_samples).fit(train, toxicity)
    to_use = feature_selector.transform(test)

    print(to_use.columns)
        
    print('number of features: ', to_use.shape[1])

    #we're going to try a bunch of different clusterings and look at the best result
    clustering = get_optimal_clustering(to_use.values, 
                                        toxicity,
                                        metric = cluster_metric)
    print(clustering[1].method)
    print(get_contingency_table(clustering[0], toxicity))
    print('correlation: ', clustering[1].correlation)
    print('rand score: ', clustering[1].rand_score,'\n')

    to_use['cluster_labels'] = clustering[0]
#     print(best_features.columns)
    to_use.index.rename('Dummy.ID', inplace = True)
    feature_list.append(to_use)
    if save_root is not None:
        best_features.to_csv(save_root
                     + 'boruta_features_k='
                     + str(n_best_clusters)
                     + '_p=' + '{:.3e}'.format(best_results.correlation)
                     + '_toxicity=' + tox_name + '.csv')

toxicity
['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances']
0.9999999998591886

['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances', 'Lateral_Pterygoid_M_tsimdoses_combined']
0.9999999999881237

['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances', 'Lateral_Pterygoid_M_tsimdoses_combined', 'MPC_volumes']
0.9999999999985696

['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances', 'Lateral_Pterygoid_M_tsimdoses_combined', 'MPC_volumes', 'Masseter_M_tsimdoses_combined']
0.99999999999954

['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances', 'Lateral_Pterygoid_M_tsimdoses_combined', 'MPC_volumes', 'Masseter_M_tsimdoses_combined']
Index(['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances',
       'Lateral_Pterygoid_M_tsimdoses_combined', 'MPC_volumes',
       'Masseter_M_tsimdoses_combined'],
      dtype='object')
number of features:  5
ward4
[[110.   8.]
 [ 24.  24.]
 [  0.   1.]
 [ 32. 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [306]:
#get all the features found before and put them together
combined_df = feature_list[0]
if len(feature_list) > 1:
    for i in range(1, len(feature_list)):
        df2 = feature_list[i]
        to_drop = list(set(combined_df.columns).intersection(set(df2.columns)))
        if len(to_drop) == df2.shape[1]:
            continue
        df2 = df2.drop(to_drop, axis = 1)
        combined_df = pd.merge(combined_df, df2, on = 'Dummy.ID')
combined_df.drop('cluster_labels', axis = 1, inplace = True)
print(combined_df.columns)
combined_clusters = get_optimal_clustering(combined_df.values, db.toxicity)
print(combined_clusters[1].method)
print(get_contingency_table(combined_clusters[0], toxicity))
print('correlation: ', combined_clusters[1].correlation)
print('rand score: ', combined_clusters[1].rand_score, '\n')
combined_df['cluster_labels'] = combined_clusters[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Index(['Parotid_Gland_tsimdoses_combined', 'Genioglossus_M_tumor_distances',
       'Lateral_Pterygoid_M_tsimdoses_combined', 'MPC_volumes',
       'Masseter_M_tsimdoses_combined'],
      dtype='object')
ward4
[[110.   8.]
 [ 24.  24.]
 [  0.   1.]
 [ 32.   1.]]
correlation:  6.579045953538654e-11
rand score:  0.17806851952526054 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [307]:
if save_root is not None:
    combined_df.to_csv(save_root
                 + 'forwardSelectionClustering.csv')

PermissionError: [Errno 13] Permission denied: 'data/clustering_results/forwardSelectionClustering.csv'

In [305]:
fisher_exact_test(AgglomerativeClustering(4).fit_predict(combined_df.values), db.toxicity)

0.0004257957682429409