In [1]:
from numpy.random import seed
seed(1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from analysis import *
from collections import namedtuple
import Metrics
from PatientSet import PatientSet
from Constants import Constants
from Clustering import *
import re

#sklearn dependencies
from sklearn.metrics import roc_auc_score, adjusted_rand_score
from sklearn.utils import resample
from sklearn.cluster import FeatureAgglomeration

#the external libraries emit a flood of deprecation warnings, so silence them
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)




Using TensorFlow backend.


In [2]:
#font sizes and default figure size shared by the plots in this notebook
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 18, 20, 24
FIG_SIZE = (20,15)

#each entry is (rc group, settings for that group); applied in the order listed
rc_settings = [
    ('font',   {'size': SMALL_SIZE}),        # default text size
    ('axes',   {'titlesize': SMALL_SIZE}),   # axes title
    ('axes',   {'labelsize': MEDIUM_SIZE}),  # x and y axis labels
    ('xtick',  {'labelsize': SMALL_SIZE}),   # x tick labels
    ('ytick',  {'labelsize': SMALL_SIZE}),   # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),    # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),  # figure title
    ('figure', {'figsize': FIG_SIZE}),       # default figure size
]
for rc_group, rc_kwargs in rc_settings:
    plt.rc(rc_group, **rc_kwargs)

In [3]:
#build the PatientSet object that holds all of the patient info
db = PatientSet()

#attach extra derived fields to the object so the db.to_dataframe function can
#return them all in one dataframe with one-hot encoding and labels automatically

#t_volumes: sum of g.volume over the gtvs in each entry of db.gtvs, as a column vector
gtv_volume_sums = [np.sum([g.volume for g in gtvs]) for gtvs in db.gtvs]
db.t_volumes = np.array(gtv_volume_sums).reshape(-1,1)
db.bilateral = (db.lateralities == 'B')
db.total_volumes = db.volumes.sum(axis = 1)
#toxicity is true when feeding_tubes + aspiration is positive
db.toxicity = (db.feeding_tubes + db.aspiration) > 0
db.tsimdoses = tsim_prediction(db)

#neck_width: norm of the difference between the left and right sternocleidomastoid centroids
lt_scm_index = Constants.organ_list.index('Lt_Sternocleidomastoid_M')
rt_scm_index = Constants.organ_list.index('Rt_Sternocleidomastoid_M')
scm_offsets = db.centroids[:, lt_scm_index, :] - db.centroids[:, rt_scm_index, :]
db.neck_width = np.linalg.norm(scm_offsets, axis = 1)

  mean_tumor_distances /= tumor_volume
  tumor_position /= tumor_volume


error reading tumor volume for  10091
error reading tumor volume for  10148






patient data loaded...



In [7]:
#parameters for the experiments
toxicities_to_test = ['toxicity']

#candidate features for the feature selection; each should be a field in the patientset
#these ones are not clustered on
unclusterable_features = [
    't_volumes',
    'bilateral',
    'total_volumes',
    'neck_width',
]
#these ones are clustered on (each individually) if feature_clustering is defined
clusterable_features = [
    'tumor_distances',
    'volumes',
]

#dose features used for feature selection vs. for actual use;
#should be some combination of actual and predicted dose
true_features = ['doses']
predicted_features = ['tsimdoses']

#number of resampling rounds used for the feature selection
#if n = 1, just use the first result
n_samples = 500

#rescaling function applied to the feature dataframes (None skips the rescaling)
df_rescale = Metrics.normalize

#lower bound on the size of the feature subset
min_features = 2

#for now just use a fixed number of clusters
n_clusters = 2
selection_clusterer = FClusterer(n_clusters)
cluster_feature_selector = FeatureClusterSelector(n_samples = n_samples, model = selection_clusterer)
lg_feature_selector = FeatureSelector(n_samples = n_samples)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
#build the two feature dataframes from the patientset and the configured feature lists
true, predicted = get_train_test_datasets(
    db,
    unclusterable_features,
    clusterable_features,
    true_features,
    predicted_features,
)
#rescale both dataframes with the configured function, if there is one
if df_rescale is not None:
    true, predicted = df_rescale(true), df_rescale(predicted)

In [None]:
#importances from the cluster-based selector on the predicted features, then a per-column summary
cluster_importances = cluster_feature_selector.get_importances(
    predicted,
    db.toxicity,
    as_df = True,
)
cluster_importances.describe()

In [None]:
#importances from lg_feature_selector on the predicted features, then a per-column summary
lg_importances = lg_feature_selector.get_importances(
    predicted,
    db.toxicity,
    as_df = True,
)
lg_importances.describe()

In [None]:
def top_n_importances(x, n):
    """Return the n largest column means of x, sorted in descending order."""
    column_means = x.mean()
    return column_means.sort_values(ascending = False).iloc[0:n]

def plot_n_importances(x, n):
    """Draw a horizontal bar chart of the n columns of x with the largest means.

    Each bar is a column mean. The error bars are the column standard
    deviation divided by sqrt(n_samples), where n_samples is the notebook-level
    setting rather than the number of rows in x. The x-axis is limited to
    [0.99 * smallest plotted mean, 1.01 * largest plotted mean].
    """
    descending_columns = x.mean().sort_values(ascending=False).index
    top = x.reindex(descending_columns, axis = 'columns').iloc[:, 0:n]
    means = top.mean()
    xlimits = [means.values.min()*.99, means.values.max()*1.01]
    means.plot.barh(xerr = top.std().values/np.sqrt(n_samples), xlim = xlimits)

In [None]:
#bar chart of the 20 columns of lg_importances with the largest mean importance
plot_n_importances(x = lg_importances, n = 20)

In [None]:
#importances from the cluster-based selector on the true features
#assigned to its own name only: the chained assignment used before also overwrote
#cluster_importances, which holds the results for the predicted features
true_cluster_importances = cluster_feature_selector.get_importances(true, db.toxicity, as_df = True)
true_cluster_importances.describe()

In [None]:
#bar chart of the 15 columns of true_cluster_importances with the largest mean importance
plot_n_importances(x = true_cluster_importances, n = 15)

In [None]:
#importances from lg_feature_selector on the true features, then a per-column summary
#(the result is stored under the name that the following lines actually read)
true_lg_importances = lg_feature_selector.get_importances(true, db.toxicity, as_df = True)
true_lg_importances.describe()

In [None]:
#bar chart of the 15 columns of true_lg_importances with the largest mean importance
plot_n_importances(x = true_lg_importances, n = 15)

In [None]:
#path (relative to the working directory) of the csv that holds the selected features
#and their cluster_labels column; it is read with pd.read_csv in the next cell
feature_file = 'data/clustering_results/metaClusteringBootstrapped500MinmaxBest.csv'

In [11]:
#read the selected features saved by the clustering notebook (first csv column is the index)
features_with_labels = pd.read_csv(feature_file, index_col=0)
#split the cluster labels off from the feature columns
cluster_labels = features_with_labels['cluster_labels']
selected_features = features_with_labels.drop(columns = ['cluster_labels'])
selected_features.head(5)

Unnamed: 0_level_0,Extended_Oral_Cavity_tsimdoses,Mandible_tsimdoses,t_volumes_0,Mandible_tumor_distances,Medial_Pterygoid_M_tsimdoses_combined,MPC_tumor_distances
Dummy.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,-0.539868,-0.456024,-0.582217,1.18507,-0.124849,0.611596
4,-0.83997,-0.679145,-0.71038,0.110453,-0.802765,0.400926
10,-0.046562,-0.332554,0.224269,0.427687,-0.157904,-1.019685
11,-1.256159,-1.164121,-0.926068,-1.155143,-1.394658,-1.334111
27,-0.600296,-0.757925,-0.751016,-1.155143,-0.962031,-1.334111


In [14]:
#get the relative loss of accuracy from dropping each of the variables in the selected features
import copy  #made explicit: copy is not imported by name anywhere in the visible notebook

#NOTE(review): the clusterer is passed positionally here, while the selectors in the
#parameter cell are built with model=... -- confirm the first positional parameter is the model
scorer = FeatureClusterSelector(copy.copy(selection_clusterer))
#one list of score drops per feature, plus the baseline score of each round
scores = {f: [] for f in selected_features.columns}
scores['baseline'] = []
for n in range(n_samples):
    #with more than one round, score a resample (sklearn.utils.resample); otherwise use the data as is
    if n_samples > 1:
        xtemp, ytemp = resample(selected_features, db.toxicity)
    else:
        xtemp, ytemp = selected_features, db.toxicity
    base_score = scorer.bootstrap_score(xtemp, ytemp).mean()
    scores['baseline'].append(base_score)
    for feature in selected_features.columns:
        #drop the feature from the SAME sample the baseline was scored on, and score against
        #the matching labels, so the difference reflects only the missing feature
        xdropped = xtemp.drop(feature, axis = 1)
        new_score = scorer.bootstrap_score(xdropped, ytemp).mean()
        scores[feature].append(base_score-new_score)
scores = pd.DataFrame(scores)
scores.describe()

KeyboardInterrupt: 

In [None]:
#convert the scores (drop in clustering correlation) to an actual importance
score_drops = scores.drop('baseline', axis = 1)
fscores = score_drops.apply(lambda col: 1/np.log(np.abs(1/col))*np.sign(col))

#error bars: standard deviation over sqrt(n_samples), or zeros when there is only one sample
if n_samples > 1:
    yerr = fscores.std()/np.sqrt(n_samples)
else:
    yerr = np.zeros((fscores.shape[1],))

#y-axis limits are 0.9x the smallest and 1.1x the largest mean
mean_fscores = fscores.mean()
yrange = [mean_fscores.values.min()*.9, mean_fscores.values.max()*1.1]
mean_fscores.plot.bar(rot = 45, ylim = yrange, yerr = yerr)

In [None]:
#sensitivity of the cluster/toxicity test to leaving rows out:
#for each p in pcounts, fit the clusterer on random subsets with p rows removed
#and record the result of fisher_exact_test between toxicity and the clusters
pcounts = np.arange(1, int(selected_features.shape[0]/2))
n_subsamples = selected_features.shape[0]
#rows: one per random subset (the row number is also the sampling seed)
#columns: one per entry of pcounts, in the same order
sensitivity_report = np.zeros((n_subsamples, len(pcounts)))
feature_df = selected_features.copy()
feature_df['toxicity'] = db.toxicity
for col, p in enumerate(pcounts):
    for n in range(n_subsamples):
        data_subset = feature_df.sample(n=int(selected_features.shape[0] - p),
                                          replace = False,
                                          random_state = n)
        tox_subset = data_subset['toxicity'].values
        clusters = selection_clusterer.fit_predict(data_subset.drop(['toxicity'],axis=1).values)
        #index by the position of p, not its value: p starts at 1, so using it as the
        #column index leaves column 0 empty and runs past the last column
        sensitivity_report[n, col] = fisher_exact_test(tox_subset, clusters)
pd.DataFrame(sensitivity_report).describe()

In [34]:
import pickle
#save the augmented patient set; the with-block closes the file even if pickling fails
with open('data/patientSetAugmented.p', 'wb') as pickle_file:
    pickle.dump(db, pickle_file)