In [28]:
from numpy.random import seed
seed(1)

from PatientSet import PatientSet
from Constants import Constants
import Metrics
from analysis import *
from Models import *
from copy import copy
import numpy as np
import pandas as pd

from time import time
from datetime import datetime

from dependencies.NCA import NeighborhoodComponentsAnalysis
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.special import softmax
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer

from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from imblearn import under_sampling, over_sampling, combine

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [25]:
class MetricLearningClassifier(BaseEstimator, ClassifierMixin):
    """Classify samples by Mahalanobis proximity to each class in an NCA-transformed space.

    ``fit`` learns a NeighborhoodComponentsAnalysis transform, then stores for
    every class the mean, the pseudo-inverse covariance and the largest
    training distance in the transformed space.  ``predict_proba`` converts
    the distance of a sample to every class into a proximity score in
    ``[1e-5, 1]`` and normalises the scores per sample.

    Parameters
    ----------
    n_components : int or 'auto'
        Dimensionality of the NCA projection.  With ``'auto'`` the number of
        components is chosen by ``get_optimal_components``.
    random_state : int
        Stored so that ``get_params``/``repr``/``clone`` can read every
        constructor argument; it is not forwarded to the transformer.
    resampler : object or None
        Optional object exposing ``fit_resample(x, y)``; applied to the
        training data before the per-class statistics are computed.
    use_softmax : bool
        If True the proximities are normalised with a row-wise softmax,
        otherwise they are divided by their row sum.
    """

    def __init__(self, n_components = 'auto',
                 random_state = 1,
                 resampler = None,
                 use_softmax = True):
        from collections import namedtuple
        self.n_components = n_components
        self.random_state = random_state
        # '!=' instead of 'is not': identity comparison with a string literal
        # only works by accident of interning.
        if n_components != 'auto':
            self.transformer = NeighborhoodComponentsAnalysis(n_components = n_components)
        self.group_parameters = namedtuple('group_parameters', ['means', 'inv_covariance', 'max_dist'])
        self.resampler = resampler
        self.use_softmax = use_softmax

    def get_optimal_components(self, x, y):
        """Return a fitted NCA transformer with a greedily chosen size.

        Starts from ``x.shape[1]`` components and keeps removing one component
        while the silhouette score of the transformed data improves by more
        than 10%; never goes below 2 components.
        """
        def fit_and_score(n_components):
            nca = NeighborhoodComponentsAnalysis(n_components = n_components)
            nca.fit(x, y)
            return silhouette_score(nca.transform(x), y), nca

        n_components = x.shape[1]
        score, nca = fit_and_score(n_components)
        while n_components > 2:
            n_components -= 1
            new_score, new_nca = fit_and_score(n_components)
            # abs() keeps the 10% margin meaningful for negative silhouette
            # scores, where '1.1 * score' would lower the bar instead.
            if new_score > score + 0.1*abs(score):
                score, nca = new_score, new_nca
            else:
                break
        return nca

    def fit(self, x, y):
        """Fit the transformer and the per-class statistics; returns ``self``."""
        from collections import OrderedDict
        x = np.asarray(x)
        y = np.asarray(y)
        if self.n_components == 'auto':
            # The selected transformer is already fitted on (x, y).
            self.transformer = self.get_optimal_components(x, y)
        else:
            self.transformer.fit(x, y)
        self.groups = OrderedDict()
        if self.resampler is not None:
            xtemp, ytemp = self.resampler.fit_resample(x, y)
            # Only accept the resampled data if no class disappeared.
            if len(np.unique(ytemp)) == len(np.unique(y)):
                x, y = xtemp, ytemp
        for group in np.unique(y):
            self.groups[group] = self.group_params(x, y, group)
        # Column order of predict_proba.
        self.classes_ = np.array(list(self.groups.keys()))
        return self

    def group_params(self, x, y, group):
        """Compute mean, inverse covariance and max training distance for one class."""
        targets = np.argwhere(y == group).ravel()
        x_target = self.transformer.transform(x[targets])
        fmeans = x_target.mean(axis = 0)
        # np.cov returns a 0-d array for a single component; pinv needs 2-d.
        inv_cov = np.linalg.pinv(np.atleast_2d(np.cov(x_target.T)))
        # The normalising distance is taken over ALL training samples, not
        # only the members of this class.
        train_dists = self.mahalanobis_distances(x, self.group_parameters(fmeans, inv_cov, 0))
        return self.group_parameters(fmeans, inv_cov, train_dists.max())

    def mahalanobis_distances(self, x, group):
        """Return the squared Mahalanobis distance of every row of ``x`` to ``group``."""
        x_offset = self.transformer.transform(x) - group.means
        left_term = np.dot(x_offset, group.inv_covariance)
        # Row-wise dot product: same values as diag(left_term @ x_offset.T)
        # without materialising the n x n matrix.
        return np.einsum('ij,ij->i', left_term, x_offset)

    def predict_proba(self, x):
        """Return an (n_samples, n_classes) array of normalised proximities."""
        proximities = []
        for group_params in self.groups.values():
            distances = self.mahalanobis_distances(x, group_params)
            # Lower clip bound keeps every row sum strictly positive.
            proximities.append(np.clip(1 - (distances/group_params.max_dist), 0.00001, 1))
        # One column per class; hstack + reshape would interleave samples
        # and classes.
        output = np.column_stack(proximities)
        if self.use_softmax:
            output = softmax(output, axis = 1)
        else:
            output = output/output.sum(axis = 1).reshape(-1,1)
        return output

    def predict(self, x):
        """Return, for every row of ``x``, the label with the highest proximity."""
        labels = np.array(list(self.groups.keys()))
        return labels[np.argmax(self.predict_proba(x), axis = 1)]

    def fit_predict(self, x, y):
        """Fit on ``(x, y)`` and return the predictions for ``x``."""
        self.fit(x,y)
        return self.predict(x)

class BayesWrapper(BaseEstimator, ClassifierMixin):
    """Naive-Bayes estimator preceded by an encoding step.

    Parameters
    ----------
    bayes : estimator or None
        Object exposing ``fit``, ``predict`` and ``predict_proba``.  ``None``
        creates a fresh ``BernoulliNB(alpha = 0)`` for this wrapper, so
        separate wrappers never share one fitted model.
    n_categories : int or None
        ``None`` one-hot encodes the inputs; otherwise the inputs are binned
        into ``n_categories`` ordinal bins with ``KBinsDiscretizer``.
    """

    def __init__(self, bayes = None, n_categories = None):
        # Stored so get_params()/repr/clone can read every constructor argument.
        self.n_categories = n_categories
        if n_categories is None:
            self.encoder = OneHotEncoder(categories = 'auto',
                                         sparse = False,
                                         handle_unknown = 'ignore')
        else:
            from sklearn.preprocessing import KBinsDiscretizer
            self.encoder = KBinsDiscretizer(n_bins = n_categories, encode = 'ordinal')
        self.bayes = BernoulliNB(alpha = 0) if bayes is None else bayes

    def fit(self, x, y):
        """Fit the encoder on ``x``, then the wrapped model on the encoded data."""
        x = self.encoder.fit_transform(x)
        self.bayes.fit(x,y)
        return self

    def predict(self, x):
        """Encode ``x`` with the fitted encoder and return the predicted labels."""
        xpred = self.encoder.transform(x)
        return self.bayes.predict(xpred)

    def predict_proba(self, x):
        """Encode ``x`` with the fitted encoder and return class probabilities."""
        xpred = self.encoder.transform(x)
        return self.bayes.predict_proba(xpred)

    def fit_predict(self, x, y):
        """Fit on ``(x, y)`` and return the predictions for ``x``."""
        self.fit(x,y)
        return self.predict(x)

In [26]:
def discretize_continuous_fields(df, n_bins):
    """Bin every column of ``df`` that has more than ``n_bins`` distinct values.

    Such columns are replaced in place by their ordinal bin index from a
    ``KBinsDiscretizer``; columns with ``n_bins`` or fewer distinct values
    are left untouched.  The same (mutated) frame is returned.
    """
    discretizer = KBinsDiscretizer(encode = 'ordinal', n_bins = n_bins)
    for name in df.columns:
        column = df[name].values
        if len(np.unique(column)) <= n_bins:
            continue
        binned = discretizer.fit_transform(column.reshape(-1, 1))
        df[name] = binned.ravel()
    return df

def get_all_splits(df, regularizer, outcomes, resamplers = None):
    """Build the splits for every (outcome, resampler) combination.

    ``outcomes`` is a sequence whose items carry the label array at index 0
    and the outcome name at index 1.  When ``resamplers`` is None a built-in
    list of imblearn samplers (plus ``None`` for "no resampling") is used.

    Returns ``{outcome_name: {str(resampler): splits}}`` where ``splits`` is
    whatever ``get_splits`` returns for that single resampler.
    """
    samplers = resamplers
    if samplers is None:
        samplers = [
            None,
            under_sampling.RandomUnderSampler(),
            over_sampling.RandomOverSampler(),
            under_sampling.InstanceHardnessThreshold(cv = 18),
            over_sampling.SMOTE(),
            combine.SMOTEENN(),
            combine.SMOTETomek(),
            under_sampling.InstanceHardnessThreshold(),
            under_sampling.RepeatedEditedNearestNeighbours(),
            under_sampling.EditedNearestNeighbours(),
            under_sampling.CondensedNearestNeighbour(),
            under_sampling.OneSidedSelection(),
        ]
    all_splits = {}
    for outcome in outcomes:
        labels, outcome_name = outcome[0], outcome[1]
        per_sampler = {}
        for sampler in samplers:
            # Each sampler gets its own independent set of splits.
            per_sampler[str(sampler)] = get_splits(df, labels, regularizer, [sampler])
        all_splits[outcome_name] = per_sampler
    return all_splits

def get_splits(df, y, regularizer = None, resamplers = None):
    """Create leave-one-out train/test splits of ``df`` against labels ``y``.

    Parameters
    ----------
    df : DataFrame
        Feature table; ``df.values`` is split row-wise.
    y : array-like
        Labels, positionally aligned with the rows of ``df``.
    regularizer : object or None
        Optional transformer; fitted on each training fold and applied to
        the matching test fold.
    resamplers : iterable or None
        Objects exposing ``fit_resample``; applied in order to the training
        fold only.  ``None`` entries are skipped, and ``None`` for the whole
        argument means "no resampling".

    Returns
    -------
    list of dict
        One dict per split with keys ``xtrain``, ``xtest``, ``ytrain``,
        ``ytest``, ``train_index``, ``test_index`` and ``feature_labels``.
    """
    from sklearn.model_selection import LeaveOneOut
    # The declared default used to be iterated directly and raised TypeError.
    if resamplers is None:
        resamplers = []
    x = df.values
    # Positional indexing below requires an array, not e.g. a labelled Series.
    y = np.asarray(y)
    feature_labels = list(df.columns)
    splits = []
    for train, test in LeaveOneOut().split(x):
        xtrain, ytrain = x[train], y[train]
        xtest, ytest = x[test], y[test]
        if regularizer is not None:
            xtrain = regularizer.fit_transform(xtrain)
            xtest = regularizer.transform(xtest)
        for resampler in resamplers:
            if resampler is not None:
                xtrain, ytrain = resampler.fit_resample(xtrain, ytrain)
        splits.append({
            'xtrain': xtrain,
            'xtest': xtest,
            'ytrain': ytrain,
            'ytest': ytest,
            'train_index': train,
            'test_index': test,
            'feature_labels': feature_labels,
        })
    return splits

def cluster_features(db,
                  baseline_features = 'data/baselineClustering.csv',
                  use_baseline_features = True,
                  top_features = 'data/clustering_results/toxicityClustering.csv',
                  use_top_features = True,
                  discrete_features = False,
                  cluster_names = ('kmeans_k=4','cluster_labels')):
    """Load the feature table and the outcome vectors from CSV files.

    Parameters
    ----------
    db
        Unused by this function; kept for call compatibility.
    baseline_features : str
        CSV with a ``Dummy.ID`` column, an ``Unnamed: 0`` column (dropped)
        and the ``FT``, ``AR`` and ``TOX`` outcome columns.
    use_baseline_features : bool
        Keep every baseline column that is not an unselected cluster/outcome
        column.
    top_features : str or other
        If a string, path of a second CSV (first column is the id) that is
        merged with the baseline on ``Dummy.ID``; any non-string value skips
        the merge.
    use_top_features : bool
        Keep the non-cluster columns of the ``top_features`` file.
    discrete_features : bool
        Bin the selected columns with ``discretize_continuous_fields(df, 5)``.
    cluster_names : sequence of str
        Columns to keep in addition to those selected by the flags above.

    Returns
    -------
    (df, ft, ar, tox)
        The selected features, with selected cluster columns expanded to one
        boolean column per value named ``'<column>=<value>'``, and the
        ``FT``, ``AR`` and ``TOX`` columns as arrays.
    """
    baseline = pd.read_csv(baseline_features, index_col = 'Dummy.ID').drop('Unnamed: 0', axis = 1)
    all_clusters = set(['manhattan_k=2','manhattan_k=3','manhattan_k=4',
                        'cluster_labels','hc_ward2','hc_ward4',
                        'FT','AR','TOX'])
    # Work on a private list so the caller's sequence is never touched.
    cluster_names = list(cluster_names)
    non_features = list(all_clusters - set(cluster_names))

    if use_baseline_features:
        cluster_names = cluster_names + list(baseline.drop(non_features, axis = 1, errors='ignore').columns)

    if isinstance(top_features, str):
        dist_clusters = pd.read_csv(top_features, index_col = 0)
        dist_clusters.index.rename('Dummy.ID', inplace = True)
        if use_top_features:
            cluster_names = cluster_names + list(dist_clusters.drop(non_features,axis=1, errors='ignore').columns)
        df = baseline.merge(dist_clusters, on=['Dummy.ID'])
    else:
        df = baseline

    # Previously this ran on 'dist_clusters' before that frame existed.
    # It now runs on the assembled table; values are expected to look like
    # 'T1'..'T4' (second character is the numeric stage).
    if 'T.category' in cluster_names and 'T.category' in df.columns:
        df['T.category'] = df['T.category'].apply(lambda x: int(x[1]))

    ft = df.FT.values
    ar = df.AR.values
    tox = df.TOX.values
    selected = set(cluster_names)
    to_drop = [col for col in df.columns if col not in selected]
    df = df.drop(to_drop, axis = 1, errors = 'ignore')
    if discrete_features:
        df = discretize_continuous_fields(df, 5)
    for col in list(df.columns):
        if col in all_clusters:
            # pd.unique keeps first-appearance order, so the generated column
            # order is reproducible (iterating a set is not).
            for g in pd.unique(df[col].values):
                col_name = col + '=' + str(g)
                df[col_name] = df[col].values == g
            df = df.drop(col, axis = 1)
    print(df.columns)
    return df, ft, ar, tox

def test_classifiers(classifiers, 
                     db = None, 
                     log = False,
                     feature_params = None,
                     regularizer = QuantileTransformer(),
                     data_splits = None,
                     print_importances = False):
    """Evaluate every classifier on every outcome and resampling scheme.

    Parameters
    ----------
    classifiers : iterable
        Estimators handed to ``presplit_roc_cv``.
    db
        Forwarded to ``cluster_features``.
    log : bool
        Also write everything that is printed to a timestamped text file
        under ``Constants.toxicity_log_file_root``.
    feature_params : dict or None
        Keyword arguments for ``cluster_features``; ``None`` uses that
        function's defaults.
    regularizer
        Transformer passed to ``get_all_splits``.  The default instance is
        created once at definition time and shared between calls.
    data_splits : dict or None
        Precomputed output of ``get_all_splits``; computed here when None.
    print_importances : bool
        Currently unused.

    Returns
    -------
    list of dict
        One record per successful (classifier, outcome, resampler) run with
        the feature settings, the names of the three and the ``AUC``.
    """
    feature_params = {} if feature_params is None else feature_params
    # 'default' marks settings the caller left to cluster_features.
    result_template = {'cluster_names': str(feature_params.get('cluster_names', 'default')),
                       'Baseline': str(feature_params.get('use_baseline_features', 'default')),
                       'Top_features': str(feature_params.get('use_top_features', 'default'))}

    log_file = None
    if log:
        timestamp = datetime.fromtimestamp(time()).strftime('%Y_%m_%d_%H%M%S')
        log_file = open(Constants.toxicity_log_file_root + timestamp +'.txt', 'w', buffering = 1)

    def write(string):
        print(string)
        if log_file is not None:
            log_file.write(str(string)+'\n')

    results = []
    try:
        df, ft, ar, tox = cluster_features(db, **feature_params)

        write('features: ' + ', '.join([str(c) for c in df.columns]) + '\n')
        outcomes = [(ft, 'feeding_tube'), (ar, 'aspiration')]
        if data_splits is None:
            data_splits = get_all_splits(df, regularizer, outcomes)
        print('splits finished')
        for classifier in classifiers:
            write(classifier)
            for outcome in outcomes:
                data_split = data_splits[outcome[1]]
                for resampler_name, splits in data_split.items():
                    write(resampler_name)
                    try:
                        auc, importances = presplit_roc_cv(classifier, splits)
                    except Exception as error:
                        # Best effort: one failing combination must not stop
                        # the sweep, but the failure is recorded.
                        write('skipped ' + str(outcome[1]) + ': ' + repr(error))
                        write('\n')
                        continue
                    write(outcome[1])
                    write(auc)
                    if importances is not None:
                        write(importances)
                    write('\n')
                    result = copy(result_template)
                    result['classifier'] = str(classifier)
                    result['outcome'] = str(outcome[1])
                    result['resampler'] = str(resampler_name)
                    result['AUC'] = auc
                    results.append(result)
    finally:
        # Close the log even when feature loading or splitting raises.
        if log_file is not None:
            log_file.close()
    return results

In [5]:
def augmented_db(db = None, db_args = {}):
    """Return a patient set with extra derived attributes attached.

    When ``db`` is None a ``PatientSet`` is built from ``db_args``.  The
    attributes set on the object are ``discrete_dists``, ``t_volumes``,
    ``bilateral``, ``pdoses``, ``t4`` and ``m_volumes``.
    """
    patients = PatientSet(**db_args) if db is None else db

    # Negated tumor distances, discretized into 15 uniform bins.
    patients.discrete_dists = Metrics.discretize(-patients.tumor_distances,
                                                 strategy='uniform',
                                                 n_bins = 15)

    # Sum of the gtv volumes of each patient, as a column vector.
    summed_volumes = [np.sum([gtv.volume for gtv in patient_gtvs])
                      for patient_gtvs in patients.gtvs]
    patients.t_volumes = np.array(summed_volumes).reshape(-1,1)

    patients.bilateral = patients.lateralities == 'B'
    patients.pdoses = default_rt_prediction(patients)
    patients.t4 = patients.t_categories == 'T4'

    # Combined volume of the right and left masseter columns.
    masseter_columns = [Constants.organ_list.index('Rt_Masseter_M'),
                        Constants.organ_list.index('Lt_Masseter_M')]
    masseter_volumes = patients.volumes[:, masseter_columns]
    patients.m_volumes = masseter_volumes.sum(axis = 1).ravel()
    return patients

# Notebook-level patient set: built with the default arguments and extended
# with the derived attributes added by augmented_db.
db = augmented_db()

  mean_tumor_distances /= tumor_volume
  tumor_position /= tumor_volume


100 [{0, 1}, {2}]
128 [{0}, {1, 2, 3, 4}]
notation not accounted for in lymph nodes: R3/R4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R3/4
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2-R4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3/4
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: L2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3
notation not accounted for in lymph nodes: R2/3
10021 [{0, 1}, {2}]
10074 [{0, 1}, {2}]
error reading tumor volume for  10091
error reading tumor volume for  10148
10191 [{0, 1}, {2}]
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:

In [None]:
# Models evaluated in this run; uncomment an entry to include it.
classifiers = [
#    DecisionTreeClassifier(),
#    DecisionTreeClassifier(criterion='entropy'),
#    XGBClassifier(1, booster = 'gblinear'),
#    XGBClassifier(3, booster = 'gblinear'),
#    XGBClassifier(5, booster = 'gblinear'),
#    XGBClassifier(),
#    XGBClassifier(booster = 'dart'),
    LogisticRegression(C = 1, solver = 'lbfgs', max_iter = 3000),
#    MetricLearningClassifier(use_softmax = True),
#    MetricLearningClassifier(
#            resampler = under_sampling.OneSidedSelection()),
#    MetricLearningClassifier(
#            resampler = under_sampling.CondensedNearestNeighbour()),
#    ExtraTreesClassifier(n_estimators = 200),
#    RandomForestClassifier(n_estimators = 200, max_depth = 3),
#    BayesWrapper(),
]

# Keyword arguments for cluster_features; mutated between the runs below.
feature_params = {
    'use_baseline_features': True,
    'top_features': None,
    'use_top_features': False,
    'discrete_features': False,
    'cluster_names': ['kmeans_k=4','cluster_labels'],
}
cluster_root= 'data/clustering_results/'
all_results = []


def run(x):
    """Evaluate the selected classifiers with feature settings ``x``, logging to file."""
    return test_classifiers(classifiers, db, log = True, feature_params = x)


def do_test():
    """Run with the current ``feature_params`` and collect the result records."""
    all_results.extend(run(feature_params))


# Baseline features combined with different cluster label columns.
cluster_combos = [[],['manhattan_k=4'],['cluster_labels'],['manhattan_k=4','cluster_labels']]
for cluster_combo in cluster_combos:
    feature_params['cluster_names'] = cluster_combo
    do_test()

# Extracted feature files on top of the baseline features...
extracted_feature_files = ['toxicity', 'aspiration', 'feedingTube']
feature_params['use_top_features'] = True
feature_params['cluster_names'] = ['cluster_labels']
for feature_file in extracted_feature_files:
    feature_params['top_features'] = cluster_root + feature_file + '.csv'
    do_test()

# ...and the same files without the baseline features.
feature_params['use_baseline_features'] = False
for feature_file in extracted_feature_files:
    feature_params['top_features'] = cluster_root + feature_file + '.csv'
    do_test()

# Stable descending sort of all records, then export with a timestamped name.
sort_keys = ['classifier',
             'outcome',
             'AUC',
             'resampler',
             'cluster_names',
             'Baseline']
df = pd.DataFrame(all_results).sort_values(sort_keys,
                                           ascending = False,
                                           kind = 'mergesort')
export_stamp = datetime.fromtimestamp(time()).strftime('%Y_%m_%d_%H%M%S')
df.to_csv('data/toxcity_classification_tests_' + export_stamp + '.csv', index = False)

Index(['Age.at.Diagnosis..Calculated.', 'Total.dose', 'Pathological.Grade_II',
       'Pathological.Grade_III', 'Pathological.Grade_IV',
       'Pathological.Grade_I', 'Therapeutic.combination_CC',
       'Therapeutic.combination_Radiation.alone',
       'Therapeutic.combination_IC.CC',
       'Therapeutic.combination_IC.Radiation.alone', 'Tm.Laterality..R.L._L',
       'Tm.Laterality..R.L._R', 'HPV.P16.status_Positive',
       'HPV.P16.status_Negative', 'T.category_T2', 'T.category_T1',
       'T.category_T3', 'T.category_T4', 'N.category_8th_edition_N1',
       'N.category_8th_edition_N2', 'N.category_8th_edition_N3',
       'N.category_8th_edition_N0', 'AJCC.8th.edition_I',
       'AJCC.8th.edition_II', 'AJCC.8th.edition_IV', 'AJCC.8th.edition_III',
       'Smoking.status.at.Diagnosis..Never.Former.Current._Never',
       'Smoking.status.at.Diagnosis..Never.Former.Current._Current',
       'Smoking.status.at.Diagnosis..Never.Former.Current._Formar',
       'Tumor.subsite_Tonsil', 'T