In [1]:
from numpy.random import seed
seed(1)

from PatientSet import *
from Constants import Constants
import Metrics
from analysis import *
from copy import copy
import numpy as np
import pandas as pd

from time import time
from datetime import datetime

from Classifiers import *
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from imblearn import under_sampling, over_sampling, combine

import warnings
# Suppress every DeprecationWarning and UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Path of the baseline feature CSV: the default `baseline_features` argument of
# cluster_features, also read directly when scoring the baseline clusters.
baseline_feature_file = Constants.baseline_feature_file




Using TensorFlow backend.


In [2]:
def presplit_roc_cv(classifier, data_split):
    """Score a classifier over precomputed cross-validation splits.

    The classifier is refit on every split. For each split the predicted
    probability in column 1 of ``predict_proba`` for the first test row is
    collected, and a single ROC AUC is computed over all collected values.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit`` and ``predict_proba``. If it has a
        ``feature_importances_`` attribute after the first fit, the
        importances are averaged over all splits.
    data_split : sequence of dict
        Each dict provides 'xtrain', 'ytrain', 'xtest' and 'ytest'; the first
        one must also provide 'feature_labels' when importances are available.

    Returns
    -------
    tuple
        ``(auc, importances)`` where ``importances`` is a ``pandas.Series``
        indexed by the first split's 'feature_labels', or ``None`` when the
        classifier has no ``feature_importances_``.

    Raises
    ------
    ValueError
        If ``data_split`` is empty.
    """
    n_splits = len(data_split)
    if n_splits == 0:
        raise ValueError('data_split must contain at least one split')

    y = np.array([split['ytest'] for split in data_split])
    ypred = np.zeros((n_splits,))
    has_importances = False
    importance_sum = None
    for i, split in enumerate(data_split):
        classifier.fit(split['xtrain'], split['ytrain'])
        ypred[i] = classifier.predict_proba(split['xtest'])[0, 1]
        if i == 0:
            # Decided once, on the first fitted model, and reused afterwards.
            has_importances = hasattr(classifier, 'feature_importances_')
        if has_importances:
            current = np.asarray(classifier.feature_importances_)
            if importance_sum is None:
                # Copy so the running sum never aliases the classifier's array.
                importance_sum = current.copy()
            else:
                # Out-of-place add: leaves the estimator's own array untouched.
                importance_sum = importance_sum + current

    if has_importances:
        importances = pd.Series(data = importance_sum / n_splits,
                                index = data_split[0]['feature_labels'])
    else:
        importances = None
    return roc_auc_score(y, ypred), importances

def discretize_continuous_fields(df, n_bins):
    """Replace, in place, every column having more than ``n_bins`` distinct values.

    Such columns are overwritten with the flattened output of a
    ``KBinsDiscretizer(n_bins = n_bins, encode = 'ordinal')`` fitted on that
    column alone; columns with at most ``n_bins`` distinct values are left as
    they are. The same (mutated) frame is returned.
    """
    binner = KBinsDiscretizer(n_bins = n_bins, encode = 'ordinal')
    for name in df.columns:
        column = df[name].values
        if np.unique(column).size <= n_bins:
            # Already coarse enough; nothing to bin.
            continue
        binned = binner.fit_transform(column.reshape(-1, 1))
        df[name] = binned.ravel()
    return df

def get_all_splits(df, regularizer, outcomes):
    """Precompute the splits for every (outcome, resampler) pair.

    ``outcomes`` is an iterable whose items carry the target values at index
    0 and the outcome name at index 1. The resamplers are taken from the
    notebook-level ``resamplers`` list, which must exist when this is called.

    Returns a dict ``{outcome name: {str(resampler): splits}}`` where
    ``splits`` is what ``get_splits`` returns for that single resampler.
    """
    by_outcome = {}
    for outcome in outcomes:
        by_resampler = {}
        for sampler in resamplers:
            key = str(sampler)
            by_resampler[key] = get_splits(df, outcome[0], regularizer, [sampler])
        by_outcome[outcome[1]] = by_resampler
    return by_outcome

def get_splits(df, y, regularizer = None, resamplers = None):
    """Build leave-one-out train/test splits from a feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; its values are the inputs and its column names are
        stored as 'feature_labels'.
    y : array-like supporting integer-array indexing
        Target values, indexed by row position.
    regularizer : transformer or None
        When given, it is fit on each training fold (``fit_transform``) and
        then applied to the held-out row (``transform``).
    resamplers : iterable or None
        Objects exposing ``fit_resample``, applied in order to the training
        fold only. ``None`` entries are skipped; ``None`` for the whole
        argument means no resampling.

    Returns
    -------
    list of dict
        One dict per split with keys 'xtrain', 'xtest', 'ytrain', 'ytest',
        'train_index', 'test_index' and 'feature_labels'.
    """
    # The default used to be iterated directly, which raised TypeError.
    if resamplers is None:
        resamplers = []
    x = df.values
    feature_labels = list(df.columns)
    splits = []
    for train, test in LeaveOneOut().split(x):
        xtrain, ytrain = x[train], y[train]
        xtest, ytest = x[test], y[test]
        if regularizer is not None:
            # Fit on the training fold only so the test row does not leak in.
            xtrain = regularizer.fit_transform(xtrain)
            xtest = regularizer.transform(xtest)
        for resampler in resamplers:
            if resampler is None:
                continue
            xtrain, ytrain = resampler.fit_resample(xtrain, ytrain)
        splits.append({
            'xtrain': xtrain,
            'xtest': xtest,
            'ytrain': ytrain,
            'ytest': ytest,
            'train_index': train,
            'test_index': test,
            'feature_labels': feature_labels,
        })
    return splits

def cluster_features(db,
                  baseline_features = baseline_feature_file,
                  use_baseline_features = True,
                  top_features = 'data/clustering_results/toxicityClustering.csv',
                  use_top_features = True,
                  discrete_features = False,
                  cluster_names = ['kmeans_k=4','cluster_labels']):
    """Assemble the feature frame and the three outcome vectors.

    Parameters
    ----------
    db : object
        Not used by this function; kept for call compatibility.
    baseline_features : str
        CSV with a 'Dummy.ID' index column, an 'Unnamed: 0' column (dropped)
        and the 'FT', 'AR' and 'TOX' outcome columns.
    use_baseline_features : bool
        When true, every baseline column that is not an unselected cluster or
        outcome column is kept as a feature.
    top_features : str or other
        When a string, the path of a CSV whose first column is the patient
        id; it is merged with the baseline table on 'Dummy.ID'. Any
        non-string value (e.g. ``None``) means the baseline table is used alone.
    use_top_features : bool
        When true (and ``top_features`` is a path), the merged file's columns
        are kept as features under the same exclusion rule.
    discrete_features : bool
        When true, the kept columns are passed through
        ``discretize_continuous_fields`` with 5 bins.
    cluster_names : list of str
        Columns to keep in any case. Kept columns that are known cluster
        columns are expanded to one boolean column per distinct value.

    Returns
    -------
    tuple
        ``(df, ft, ar, tox)``: the feature frame and the 'FT', 'AR' and 'TOX'
        columns as arrays, taken before the feature columns are filtered.
    """
    baseline = pd.read_csv(baseline_features, index_col = 'Dummy.ID').drop('Unnamed: 0', axis = 1)
    all_clusters = set(['manhattan_k=2','manhattan_k=3','manhattan_k=4',
                        'cluster_labels','hc_ward2','hc_ward4',
                        'FT','AR','TOX'])
    # Cluster/outcome columns that were not explicitly requested.
    non_features = list(all_clusters - set(cluster_names))
    # Work on a copy so the shared default list can never be modified.
    cluster_names = list(cluster_names)

    if use_baseline_features:
        cluster_names = cluster_names + list(baseline.drop(non_features, axis = 1, errors='ignore').columns)

    if isinstance(top_features, str):
        dist_clusters = pd.read_csv(top_features, index_col = 0)
        dist_clusters = dist_clusters.rename_axis('Dummy.ID')
        if use_top_features:
            cluster_names = cluster_names + list(dist_clusters.drop(non_features, axis = 1, errors='ignore').columns)
        df = baseline.merge(dist_clusters, on=['Dummy.ID'])
    else:
        df = baseline

    # This conversion used to read `dist_clusters` before it was assigned and
    # so raised UnboundLocalError whenever 'T.category' was selected. It now
    # runs on the assembled frame, whichever table supplied the column.
    # NOTE(review): int(x[1]) assumes values shaped like 'T3' -- confirm.
    if 'T.category' in cluster_names and 'T.category' in df.columns:
        df['T.category'] = df['T.category'].apply(lambda x: int(x[1]))

    # Outcomes are read before the frame is reduced to the feature columns.
    ft = df.FT.values
    ar = df.AR.values
    tox = df.TOX.values
    to_drop = set(df.columns) - set(cluster_names)
    df = df.drop(list(to_drop), axis = 1, errors = 'ignore')
    if discrete_features:
        df = discretize_continuous_fields(df, 5)

    # One boolean indicator column per distinct value of each kept cluster column.
    for col in list(df.columns):
        if col in all_clusters:
            groups = set(df[col].values)
            for g in groups:
                col_name = col + '=' + str(g)
                df[col_name] = df[col].values == g
            df = df.drop(col, axis = 1)
    return df, ft, ar, tox

def test_classifiers(classifiers, 
                     db = None, 
                     log = False,
                     feature_params = {},
                     regularizer = QuantileTransformer(),
                     data_splits = None,
                     print_importances = False,
                    additional_features = None):
    """Evaluate each classifier on every outcome and resampler combination.

    Parameters
    ----------
    classifiers : iterable
        Estimators handed to ``presplit_roc_cv``.
    db : object
        Forwarded to ``cluster_features``; also used for
        ``db.to_dataframe`` when ``additional_features`` is given.
    log : bool
        When true, everything passed to the internal ``write`` helper is also
        appended to a timestamped text file under
        ``Constants.toxicity_log_file_root``.
    feature_params : dict
        Keyword arguments for ``cluster_features``. Must contain the keys
        'cluster_names', 'use_baseline_features', 'use_top_features' and
        'top_features', which are copied into every result row.
    regularizer : transformer or None
        Forwarded to ``get_all_splits`` when splits are computed here.
    data_splits : dict or None
        Precomputed splits shaped like the return value of
        ``get_all_splits``; computed when ``None``.
    print_importances : bool
        When true, feature importances (if any) are written as well.
    additional_features : tuple or None
        ``(attributes, organ_list)`` passed to ``db.to_dataframe``.

    Returns
    -------
    list of dict
        One row per successful (classifier, outcome, resampler) run with the
        feature settings, the names of the three and the 'AUC'.

    Raises
    ------
    KeyError
        If ``feature_params`` lacks one of the required keys.
    """
    result_template = {'cluster_names': str(feature_params['cluster_names']),
                       'Baseline': str(feature_params['use_baseline_features']),
                       'Top_features': str(feature_params['use_top_features']),
                       'Top_feature_file': str(feature_params['top_features']),
                      }

    log_file = None
    if log:
        timestamp = datetime.fromtimestamp(time()).strftime('%Y_%m_%d_%H%M%S')
        log_file = open(Constants.toxicity_log_file_root + timestamp +'.txt', 'w', buffering = 1)

    def write(string):
        # Echo to stdout and, when logging is on, to the log file.
        print(string)
        if log_file is not None:
            log_file.write(str(string)+'\n')

    # try/finally so the log file is closed even when feature loading or
    # split generation raises.
    try:
        df, ft, ar, tox = cluster_features(db, **feature_params)
        if additional_features is not None:
            #should be tuple of attributes, organ_list (default none) to pass to patientset.to_dataframe
            df = db.to_dataframe(additional_features[0], df, additional_features[1])
        write(str(feature_params))
        outcomes = [(ft, 'feeding_tube'), (ar, 'aspiration'), (tox, 'toxicity')]
        data_splits = get_all_splits(df, regularizer, outcomes) if data_splits is None else data_splits
        print('splits finished')
        results = []
        for classifier in classifiers:
            write(classifier)
            for outcome in outcomes:
                data_split = data_splits[outcome[1]]
                for resampler_name, splits in data_split.items():
                    try:
                        write(resampler_name)
                        auc, importances = presplit_roc_cv(classifier, splits)
                        write(outcome[1])
                        write(auc)
                        if importances is not None and print_importances:
                            write(importances)
                        write('\n')
                        result = dict(result_template)
                        result['classifier'] = str(classifier)
                        result['outcome'] = str(outcome[1])
                        result['resampler'] = str(resampler_name)
                        result['AUC'] = auc
                        results.append(result)
                    except Exception as e:
                        # Best effort: a failing combination is reported (now
                        # in the log file too) and the sweep continues.
                        write(e)
        return results
    finally:
        if log_file is not None:
            log_file.close()

In [3]:
# Build the `db` object that is passed to test_classifiers and read by
# print_cluster_results in the cells below.
# NOTE(review): load_patientset is not defined in this notebook; presumably it
# comes from one of the star imports in the first cell -- confirm.
db = load_patientset()

In [20]:
def augmented_db(db = None, db_args = {}):
    """Return ``db`` with a combined ``toxicity`` attribute set on it.

    When ``db`` is ``None`` a new ``PatientSet(**db_args)`` is built first.
    ``toxicity`` is ``feeding_tubes + aspiration > 0``; the object is modified
    in place and returned.
    """
    patients = PatientSet(**db_args) if db is None else db
    any_event = patients.feeding_tubes + patients.aspiration
    patients.toxicity = any_event > 0
    return patients
# Estimators to evaluate. Commented entries are alternatives that are
# currently switched off.
classifiers = [
    # DecisionTreeClassifier(),
    # DecisionTreeClassifier(criterion='entropy'),
    # XGBClassifier(1, booster = 'gblinear'),
    # XGBClassifier(3, booster = 'gblinear'),
    # XGBClassifier(5, booster = 'gblinear'),
    # XGBClassifier(),
    # XGBClassifier(booster = 'dart'),
    LogisticRegression(solver = 'lbfgs', max_iter = 8000),
    # MetricLearningClassifier(use_softmax = True),
    # MetricLearningClassifier(
    #         resampler = under_sampling.OneSidedSelection()),
    # MetricLearningClassifier(
    #         resampler = under_sampling.CondensedNearestNeighbour()),
    # ExtraTreesClassifier(n_estimators = 200),
    # RandomForestClassifier(n_estimators = 200, max_depth = 3),
    # BayesWrapper(),
]

# Resamplers applied to each training fold; None means "no resampling".
# This list is read as a global by get_all_splits.
resamplers = [
    None,
    # under_sampling.RandomUnderSampler(),
    # over_sampling.RandomOverSampler(),
    # under_sampling.InstanceHardnessThreshold(
    #         estimator = MetricLearningClassifier(),
    #         cv = 18),
    # under_sampling.InstanceHardnessThreshold(cv = 18),
    # over_sampling.SMOTE(),
    # combine.SMOTEENN(),
    # combine.SMOTETomek(),
    # under_sampling.InstanceHardnessThreshold(),
    # under_sampling.RepeatedEditedNearestNeighbours(),
    # under_sampling.EditedNearestNeighbours(),
    # under_sampling.CondensedNearestNeighbour(),
    # under_sampling.OneSidedSelection(),
]

cluster_root = 'data/clustering_results/'
# Loop through all files of features extracted from toxicity clustering.
feature_file_names = ['kmeans4_3features']
all_results = []
additional_features = None


def run(x):
    """Evaluate ``classifiers`` on ``db`` with feature settings ``x``, with file logging on."""
    return test_classifiers(classifiers, db,
                            log = True,
                            feature_params = x,
                            additional_features = additional_features)


def do_test():
    """Run once with the current global ``feature_params`` and append the rows to ``all_results``."""
    return all_results.extend(run(feature_params))

def try_features():
    """Run ``do_test`` once per entry of ``feature_file_names``.

    Before each run the global ``feature_params['top_features']`` is pointed
    at ``cluster_root + <name> + '.csv'``; the key keeps the last value after
    the function returns.
    """
    for name in feature_file_names:
        csv_path = cluster_root + name + '.csv'
        feature_params['top_features'] = csv_path
        do_test()

In [21]:
#get the unsupervised AUC scores
def print_cluster_results(clusters, name):
    """Print an AUC per outcome for the given cluster assignment.

    Prints ``name``, then for each of the ``db`` attributes 'feeding_tubes',
    'aspiration' and 'toxicity' prints the attribute name and the
    ``roc_auc_score`` of column 1 of
    ``BayesWrapper().fit_predict(clusters, labels.reshape(-1,1), True)``
    against the labels. A blank line is printed at the end. Reads the
    notebook-level ``db``.
    """
    print(name)
    for attribute in ('feeding_tubes', 'aspiration', 'toxicity'):
        print(attribute)
        labels = getattr(db, attribute)
        predicted = BayesWrapper().fit_predict(clusters, labels.reshape(-1,1), True)
        print(roc_auc_score(labels, predicted[:,1]))
    print()
    
# Baseline cluster labels ('manhattan_k=4' column) as a single-column 2-D array.
baseline_clusters = pd.read_csv(baseline_feature_file, index_col='Dummy.ID')['manhattan_k=4'].values.reshape(-1,1)
print_cluster_results(baseline_clusters, 'baseline')

# Score each clustering file alone and side by side with the baseline labels.
for feature_file in feature_file_names:
    file = cluster_root + feature_file + '.csv'
    spatial_clusters = pd.read_csv(file).cluster_labels.values.reshape(-1,1)
    print_cluster_results(spatial_clusters, feature_file)
    # NOTE(review): hstack pairs rows by position, not by patient id; this
    # assumes both CSVs list the same patients in the same order -- confirm.
    both_clusters = np.hstack([spatial_clusters, baseline_clusters])
    print_cluster_results(both_clusters, feature_file + '+baseline')

baseline
feeding_tubes
0.7258682328907048
aspiration
0.7525443442861297
toxicity
0.7402551381998583

kmeans4_3features
feeding_tubes
0.7779622063329928
aspiration
0.8130270427449839
toxicity
0.8195428773919206

kmeans4_3features+baseline
feeding_tubes
0.7925178753830439
aspiration
0.8471939517301541
toxicity
0.8327427356484762



In [22]:
#try out combinations of just clusters
# Shared settings dict: do_test reads it as a global and later cells keep
# mutating this same object.
feature_params = {
    'use_baseline_features': False,
    # None: cluster_features uses the baseline table alone for this first run.
    'top_features': None,
    'use_top_features': False,       
    'discrete_features': False,
    'cluster_names': ['manhattan_k=4']
}
do_test()
# try_features sets feature_params['top_features'] to each clustering file;
# the key stays set for every cell after this one.
for cluster_combo in [['manhattan_k=4', 'cluster_labels'],['cluster_labels']]:
    feature_params['cluster_names'] = cluster_combo
    try_features()

{'use_baseline_features': False, 'top_features': None, 'use_top_features': False, 'discrete_features': False, 'cluster_names': ['manhattan_k=4']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
None
feeding_tube
0.586567926455567


None
aspiration
0.49723756906077343


None
toxicity
0.6031183557760453


{'use_baseline_features': False, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': False, 'discrete_features': False, 'cluster_names': ['manhattan_k=4', 'cluster_labels']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
                   multi_class

In [23]:
#baseline test
# Uses the feature_params left by the previous cell: 'top_features' still
# points at the last clustering file, so cluster_features still merges that
# file with the baseline table even though use_top_features is False.
feature_params['use_baseline_features'] = True
for cluster_combo in [[],['manhattan_k=4']]:
    feature_params['cluster_names'] = cluster_combo
    do_test()

{'use_baseline_features': True, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': False, 'discrete_features': False, 'cluster_names': []}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
None
feeding_tube
0.6404494382022471


None
aspiration
0.8493748182611224


None
toxicity
0.7888022678951098


{'use_baseline_features': True, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': False, 'discrete_features': False, 'cluster_names': ['manhattan_k=4']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
                   mul

In [24]:
#test out adding clusters from each type (baseline + clusters)
# Baseline columns plus the selected cluster labels; the other columns of the
# clustering file are not added as features (use_top_features is False).
feature_params['use_baseline_features'] = True
feature_params['use_top_features'] = False
feature_params['cluster_names'] = ['manhattan_k=4', 'cluster_labels']
try_features()

# Same run with only the 'cluster_labels' column requested by name.
feature_params['cluster_names'] = ['cluster_labels']
try_features()

{'use_baseline_features': True, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': False, 'discrete_features': False, 'cluster_names': ['manhattan_k=4', 'cluster_labels']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
None
feeding_tube
0.6800306435137896


None
aspiration
0.848502471648735


None
toxicity
0.8139617292700213


{'use_baseline_features': True, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': False, 'discrete_features': False, 'cluster_names': ['cluster_labels']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_

In [25]:
#now try just spatial features
# Columns of the clustering file become features; baseline columns are left out.
feature_params['use_top_features'] = True
feature_params['use_baseline_features'] = False
feature_params['cluster_names']= ['cluster_labels']
try_features()

# Repeat with the baseline columns added back in.
feature_params['use_baseline_features'] = True
try_features()

{'use_baseline_features': False, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': True, 'discrete_features': False, 'cluster_names': ['cluster_labels']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
None
feeding_tube
0.6943309499489274


None
aspiration
0.7234661238732188


None
toxicity
0.7501771793054571


{'use_baseline_features': True, 'top_features': 'data/clustering_results/kmeans4_3features.csv', 'use_top_features': True, 'discrete_features': False, 'cluster_names': ['cluster_labels']}
splits finished
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=8000,
      

In [26]:
#save it all
# One row per (classifier, outcome, resampler) run collected in all_results,
# sorted in descending order on the listed columns.
# NOTE(review): per the pandas documentation `kind` only applies when sorting
# on a single column, so 'mergesort' presumably has no effect here -- confirm.
df = pd.DataFrame(all_results).sort_values(
        ['classifier',
         'outcome',
         'AUC',
         'resampler',
         'cluster_names',
         'Baseline'],
         kind = 'mergesort',
         ascending = False)
# Timestamped file name so repeated runs do not overwrite each other.
# NOTE(review): 'toxcity' looks like a typo for 'toxicity'; left as is because
# other code may look for this exact file name.
df.to_csv('data/toxcity_classification_tests_'
          + datetime.fromtimestamp(time()).strftime('%Y_%m_%d_%H%M%S')
          + '.csv', index = False)