In [1]:
from numpy.random import seed
seed(1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from analysis import *
from collections import namedtuple
import Metrics
from PatientSet import PatientSet
from Constants import Constants
from dependencies.Boruta import BorutaPy

#for getting the fisher exact test
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr
rpy2.robjects.numpy2ri.activate()

#sklearn dependencies
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, adjusted_rand_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import resample
from scipy.cluster.hierarchy import fcluster, linkage

#the external libraries emit a very large number of deprecation warnings, so silence them
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

W0926 08:58:25.705495 10320 deprecation_wrapper.py:119] From F:\Skool\EVL_Research\CAMP-RT\PYTHON\PatientSet.py:8: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.

Using TensorFlow backend.
  data = yaml.load(f.read()) or {}


In [2]:
#distance functions and a wrapper class for hierarchical clustering with more linkages than sklearn

def l1(x1, x2):
    """Manhattan (L1) distance: sum of the absolute element-wise differences."""
    difference = x1 - x2
    return np.sum(np.absolute(difference))

def tanimoto_dist(x1, x2):
    """Distance derived from the Tanimoto coefficient of two vectors.

    Returns 0 when the vectors are identical, otherwise 1/(1 + T) where
    T = x1.x2 / (x1.x1 + x2.x2 - x1.x2).

    x1 and x2 must support subtraction and ``.dot`` (e.g. 1-d numpy arrays).
    """
    # identical vectors (zero L1 distance, the same quantity l1(x1, x2) computes)
    # are given distance 0; this also avoids 0/0 when both are all zeros
    if np.sum(np.abs(x1 - x2)) == 0:
        return 0
    cross = x1.dot(x2)
    tanimoto = cross/(x1.dot(x1) + x2.dot(x2) - cross)
    return 1/(1+tanimoto)

def l2(x1, x2):
    """Euclidean (L2) distance: square root of the summed squared differences."""
    difference = x1 - x2
    squared_total = np.sum(difference**2)
    return np.sqrt(squared_total)

def pdist(x, dist_func):
    """Evaluate dist_func on every ordered pair of rows of x.

    The result is a flat array of length n*n (n = x.shape[0]) laid out row by
    row: entry i*n + j is dist_func(x[i], x[j]).  Self-distances and both
    orderings of each pair are included, so this is NOT the condensed format
    that scipy's pdist returns.
    """
    n_rows = x.shape[0]
    pairwise = [dist_func(x[a], x[b])
                for a in range(n_rows)
                for b in range(n_rows)]
    return np.array(pairwise)

class FClusterer():
    """Thin wrapper around scipy's linkage/fcluster exposing fit_predict.

    n_clusters is stored as the threshold ``t`` handed to fcluster, so its
    meaning depends on ``criterion`` (with the default 'maxclust' it is the
    maximum number of flat clusters).
    """

    def __init__(self, n_clusters, dist_func = l1, link = 'weighted', criterion = 'maxclust'):
        # for these linkage methods the supplied dist_func is ignored and
        # scipy's built-in 'euclidean' metric is used instead
        euclidean_only = link in ['median', 'ward', 'centroid']
        self.link = link
        self.dist_func = 'euclidean' if euclidean_only else dist_func
        self.t = n_clusters
        self.criterion = criterion

    def fit_predict(self, x, y = None):
        """Build the linkage for x and return the flat cluster labels; y is unused."""
        tree = linkage(x, method = self.link, metric = self.dist_func)
        labels = fcluster(tree, self.t, criterion = self.criterion)
        return labels

    

In [3]:
#functions for getting clusters
# Record describing one evaluated clustering:
#   method      - clusterer name with the number of clusters found appended
#   cluster     - which cluster the statistics refer to ('all' when built by analyze_clusters)
#   correlation - value returned by fisher_exact_test for the cluster labels vs. the target
#   rand_score  - adjusted Rand score of the cluster labels vs. the target
#   model       - the clusterer object that produced the labels
cluster_result = namedtuple('cluster_result', ['method', 'cluster', 'correlation','rand_score', 'model'])

def get_sortable_metric(c_result, metric):
    """Return a sort key for c_result where smaller always means better.

    'correlation' is returned unchanged (a small value is wanted), while
    'rand_score' is negated (a large value is wanted).  Any other metric name
    trips the assertion.
    """
    assert(metric in ['correlation', 'rand_score'])
    if metric != 'correlation':
        return -c_result.rand_score
    return c_result.correlation

def get_clusterers(ks = [2,5]):
    """Build the candidate clusterers, grouped by name.

    ks is a (start, stop) pair fed to range(), so stop is exclusive: the
    default yields cluster counts 2, 3 and 4.  Returns a dict mapping a
    family name to a list with one FClusterer per cluster count.
    """
    k_values = range(ks[0], ks[1])
    # (family name, extra FClusterer keyword arguments); 'centroid' and
    # 'median' linkages are deliberately left out
    families = [
        ('l1_weighted', {}),
        ('l2_weighted', {'dist_func': l2}),
        ('l1_complete', {'link': 'complete'}),
        ('l2_complete', {'dist_func': l2, 'link': 'complete'}),
        ('ward', {'link': 'ward'}),
    ]
    return {family: [FClusterer(k, **options) for k in k_values]
            for family, options in families}

def fisher_exact_test(c_labels, y):
    """P-value of Fisher's exact test between cluster labels and an outcome.

    c_labels and y are equal-length label vectors.  The contingency table is
    built with get_contingency_table and handed to R's stats::fisher.test
    through rpy2; the first entry of the first element of the R result is
    returned, which the code treats as the p-value.

    When either vector has fewer than two distinct values the table is
    degenerate and no association can be tested, so 1.0 (no evidence of
    association) is returned without calling R.  Returning 0 here would make
    a degenerate input look like the most significant result possible.
    """
    if len(set(y)) < 2:
        print('fisher test run with no positive class')
        return 1.0
    if len(set(c_labels)) < 2:
        # a single cluster gives a one-row table, which fisher.test rejects
        return 1.0
    #call fishers test from r
    contingency = get_contingency_table(c_labels, y)
    stats = importr('stats')
    pval = stats.fisher_test(contingency)[0][0]
    return pval

def get_contingency_table(x, y):
    """Cross-tabulate two equal-length label vectors.

    Rows correspond to the sorted unique values of x and columns to the
    sorted unique values of y; entry [i, j] counts the positions where x has
    the i-th value and y has the j-th value.  Returns a float array of shape
    (n_unique_x, n_unique_y).

    Inputs are converted to flat numpy arrays first, so plain Python lists
    work too (an element-wise comparison against a list would otherwise
    evaluate to a single False and yield an all-zero table).
    """
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    rows = np.unique(x)  # np.unique already returns sorted values
    cols = np.unique(y)
    table = np.zeros((len(rows), len(cols)))
    for row_index, row_var in enumerate(rows):
        in_row = (x == row_var)  # computed once per row instead of once per cell
        for col_index, col_var in enumerate(cols):
            table[row_index, col_index] = np.count_nonzero(in_row & (y == col_var))
    return table

def analyze_clusters(target_var, name, clusterer, features, metric = 'correlation'):
    """Cluster features with clusterer and score the labels against target_var.

    Returns None when fewer than two distinct clusters come back; otherwise a
    cluster_result whose method is name with the cluster count appended,
    whose cluster field is 'all', and which carries the fisher_exact_test
    value, the adjusted Rand score and the clusterer itself.  The metric
    argument is accepted but not used here.
    """
    labels = clusterer.fit_predict(features).ravel()
    n_found = len(set(labels))
    if n_found < 2:
        return None
    return cluster_result(method = name + str(n_found),
                          cluster = 'all',
                          correlation = fisher_exact_test(labels, target_var),
                          rand_score = adjusted_rand_score(labels, target_var),
                          model = clusterer)

def cluster(target_var, features, metric = 'correlation', args = None):
    """Run every candidate clusterer on features and rank the outcomes.

    args, when given, must be a list of column indices; features is reduced
    to those columns first.  Each clusterer from get_clusterers() is scored
    with analyze_clusters, None outcomes are discarded, and the remaining
    cluster_result records are returned sorted best-first according to
    get_sortable_metric(result, metric).
    """
    if args is not None:
        assert( isinstance(args, list) )
        features = features[:, args]
    outcomes = (analyze_clusters(target_var, family, model, features, metric)
                for family, models in get_clusterers().items()
                for model in models)
    usable = [outcome for outcome in outcomes if outcome is not None]
    return sorted(usable, key = lambda outcome: get_sortable_metric(outcome, metric))

def get_optimal_clustering(features, target_var,
                           metric = 'correlation',
                           args = None,
                           patient_subset = None):
    """Pick the best-ranked clusterer and label the patients with it.

    features is a 2-d array (rows are patients), target_var the outcome
    vector, args an optional list of feature columns and patient_subset an
    optional row index.  Candidates are ranked by cluster() on the selected
    rows; the top model is then re-run on those rows and its labels, shifted
    up by one, are written into an array that is zero everywhere else.

    Returns (clusters, clusterer_data): clusterer_data is the winning
    cluster_result with correlation and rand_score recomputed from the full
    label array against the full target_var.
    """
    clusters = np.zeros(target_var.shape)
    target = target_var
    if patient_subset is not None:
        target = target_var[patient_subset]
        features = features[patient_subset,:]
    ranked = cluster(target, features,  metric, args)
    if args is not None:
        features = features[:, args]
    best = ranked[0]
    # with patient_subset = None this index acts as np.newaxis, so the
    # assignment broadcasts over the whole array
    clusters[patient_subset] = best.model.fit_predict(features).ravel() + 1
    clusterer_data = best._replace(correlation = fisher_exact_test(clusters, target_var),
                                   rand_score = adjusted_rand_score(clusters, target_var))
    return (clusters, clusterer_data)


In [4]:
#load in the patientset object that has all the patient info
db = PatientSet()

#add a bunch of features to the object that we'll want to try
#so we can use the db.to_dataframe function to get them all in a nice dataframe with one-hot encoding and labels automatically
#the tumor distances are negated before being binned; the volumes are binned as-is
db.discrete_dists = Metrics.discretize(-db.tumor_distances, n_bins = 15, strategy='uniform')
db.discrete_volumes = Metrics.discretize(db.volumes, n_bins = 15, strategy='uniform')
#one value per entry of db.gtvs: the summed .volume of its items, reshaped into a single column
db.t_volumes = np.array([np.sum([g.volume for g in gtvs]) for gtvs in db.gtvs]).reshape(-1,1)
#tsim_prediction is not defined in this notebook; presumably it comes from the `analysis` star import - TODO confirm
db.tsimdoses = tsim_prediction(db)
#`+` binds tighter than `>`, so these compare the SUM of the two fields with the threshold
db.toxicity = db.feeding_tubes + db.aspiration > 0
#NOTE(review): the attribute name suggests xerostomia, but the value is derived from feeding_tubes and aspiration - confirm intent
db.xerostima = db.feeding_tubes + db.aspiration > 1

  mean_tumor_distances /= tumor_volume
  tumor_position /= tumor_volume


100 [{0, 1}, {2}]
128 [{0}, {1, 2, 3, 4}]
notation not accounted for in lymph nodes: R3/R4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R3/4
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2-R4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3/4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3
10021 [{0, 1}, {2}]
error reading tumor volume for  10041
10074 [{0, 1}, {2}]
error reading tumor volume for  10091
error reading tumor volume for  10148
10191 [{0, 1}, {2}]


W0926 08:58:54.584146 10320 deprecation_wrapper.py:119] From F:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0926 08:58:54.585146 10320 deprecation_wrapper.py:119] From F:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0926 08:58:54.589147 10320 deprecation_wrapper.py:119] From F:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0926 08:58:54.596147 10320 deprecation_wrapper.py:119] From F:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:4115: The name tf.random_normal is deprecated. Please use tf.random.normal instead.

W0926 08:58:54.683152 10320 deprecation_wrapper.py:119] From F:\Anaconda\lib\site-packages\keras\opt


patient data loaded...



In [18]:
#parameters for the experiments
#names of the PatientSet attributes used as outcomes; each one is thresholded with > 0 in the experiment loop
toxicities_to_test = ['feeding_tubes', 'aspiration', 'toxicity']

#features to test the feature selection on.  should be fields in the patientset we have
candidate_features = ['tumor_distances', 'volumes', 't_volumes', 
                      'lateralities', 'subsites']

#number of times to resample the data and repeat the feature selection
#if n_samples == 1 the data is not resampled and the single fit is used directly
n_samples = 100

#type of scaling to use
#this single scaler object is shared: it is re-fit every time fit_transform is called on it
scaler = MinMaxScaler()

#lower bound on the size of a feature subset: the threshold search stops once fewer features than this remain
min_features = 2

#class used to subset the data, default is what the original paper suggests, roughly
boruta = BorutaPy(RandomForestClassifier(n_estimators = 500, max_depth = 10), n_estimators = 1000)

#prefix prepended to every output file name, put None if you don't want to save
save_root = 'data/clustering_results/'

#how to decide which clustering method is best.  should be 'rand_score' or 'correlation'
cluster_metric = 'correlation'

In [None]:
#our actual experiment: for each toxicity, score feature support with the boruta method,
#then look for the support threshold whose feature subset clusters best against that toxicity

def get_resampled_support(x, y):
    """Fit the shared boruta selector on one sample and return its support masks.

    x is a DataFrame of features and y the matching outcome vector.  When
    n_samples > 1 the rows are first redrawn with sklearn's resample (default
    settings); otherwise the data is used as-is.  The data is rescaled with
    the shared scaler before fitting.  Returns (support_, support_weak_).
    """
    if n_samples > 1:
        x, y = resample(x.values, y)
    x = scaler.fit_transform(x)
    boruta.fit(x, y)
    return boruta.support_, boruta.support_weak_

feature_list = []
for tox_name in toxicities_to_test:
    print(tox_name)
    toxicity = getattr(db, tox_name) > 0
    
    #use actual doses to train, and predicted for the clustering 
    train = db.to_dataframe(candidate_features + ['doses'], merge_mirrored_organs = True)
    test = db.to_dataframe(candidate_features + ['tsimdoses'], merge_mirrored_organs = True)
    
    #save the boruta support for each trial in a dataframe, scores are % of time the variable has support or weak support
    #NOTE(review): columns are labelled from test but filled from fits on train; this assumes both frames share one column layout
    supports = pd.DataFrame(data = np.zeros((2,train.shape[1])), columns = test.columns, index =['support', 'weak_support'])
    #we're going to resample the data, scale it, and apply the boruta method n_samples times
    for n in range(n_samples):
        sup, weak_sup = get_resampled_support(train, toxicity)
        supports.loc['support'] += sup/n_samples
        supports.loc['weak_support'] += weak_sup/n_samples
        
    #try out a bunch of thresholds on how good the variable is supported vs cluster results
    #best_score == 1 doubles as the "nothing accepted yet" marker checked after the loop
    best_score = 1
    prev_argcount = test.shape[1]
    for support_thresh in [.2,.3,.4,.5,.6,.7,.8,.9]:
        top_args = np.argwhere(supports.loc['support'] >= support_thresh).ravel()
        if len(top_args) < min_features:
            break
        #skip thresholds that select exactly as many features as the previous one
        if len(top_args) == prev_argcount:
            continue
        prev_argcount = len(top_args)
        to_use = test.iloc[:, top_args]
        
        print('number of features: ', to_use.shape[1])
        
        #we're going to try a bunch of different clusterings and look at the best result
        clustering = get_optimal_clustering(scaler.fit_transform(to_use.values), 
                                            toxicity,
                                            metric = cluster_metric)
        print(clustering[1].method)
        print(get_contingency_table(clustering[0], toxicity))
        print('correlation: ', clustering[1].correlation)
        print('rand score: ', clustering[1].rand_score,'\n')
        #keep this feature set only if its score is below a fifth of the best one so far,
        #so a later (stricter threshold, smaller) set has to be clearly better to replace an earlier one
        #NOTE(review): with cluster_metric = 'rand_score' the score is negative and dividing by 5 loosens the test - confirm intent
        if get_sortable_metric(clustering[1],cluster_metric) < best_score/5:
            best_score = get_sortable_metric(clustering[1],cluster_metric)
            best_clusters = clustering[0]
            #DataFrame.copy() instead of a bare copy(), which is never imported in this notebook
            best_features = to_use.copy()
            best_results = clustering[1]
            n_best_clusters = len(set(clustering[0]))
            
    #check that we actually got a result
    if best_score == 1:
        print('no good values')
        continue
    best_features['cluster_labels'] = best_clusters
    print(best_features.columns)
    best_features.index.rename('Dummy.ID', inplace = True)
    feature_list.append(best_features)
    if save_root is not None:
        best_features.to_csv(save_root
                     + 'boruta_features_k='
                     + str(n_best_clusters)
                     + '_p=' + '{:.3e}'.format(best_results.correlation)
                     + '_toxicity=' + tox_name + '.csv')

feeding_tubes


In [None]:
#get all the features found before and put them together
if len(feature_list) == 0:
    raise ValueError('no feature sets were selected, nothing to combine')
combined_df = feature_list[0]
#visit every remaining feature set, including the last one
for i in range(1, len(feature_list)):
    df2 = feature_list[i]
    #columns already present (this always includes 'cluster_labels') are not merged in again
    to_drop = list(set(combined_df.columns).intersection(set(df2.columns)))
    if len(to_drop) == df2.shape[1]:
        continue
    df2 = df2.drop(to_drop, axis = 1)
    combined_df = pd.merge(combined_df, df2, on = 'Dummy.ID')
#drop into a new frame: when no merge happened combined_df is still feature_list[0],
#and an in-place drop would modify that stored frame and break a re-run of this cell
combined_df = combined_df.drop('cluster_labels', axis = 1)
print(combined_df.columns)
combined_clusters = get_optimal_clustering(scaler.fit_transform(combined_df.values), db.toxicity)
print(combined_clusters[1].method)
#tabulate against the same outcome the clustering was scored on, not the loop variable left over from the previous cell
print(get_contingency_table(combined_clusters[0], db.toxicity))
print('correlation: ', combined_clusters[1].correlation)
print('rand score: ', combined_clusters[1].rand_score, '\n')
#item assignment creates the column; attribute assignment would only set a Python attribute on the frame
combined_df['cluster_labels'] = combined_clusters[0]

In [None]:
if save_root is not None:
    #write the combined feature frame; best_features only holds the last per-toxicity result
    combined_df.to_csv(save_root
                 + 'combinedClusteringWithNonspatial.csv')