In [18]:
#automatically reload stuff
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from Constants import Const
import json
import Utils
import re
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.cluster import SpectralClustering, KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from scipy.stats import chi2
from ast import literal_eval
import statsmodels.api as sm
import Metrics
import simplejson

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter('ignore', HessianInversionWarning)

In [19]:

def nested_responsify(dictionary):
    if isinstance(dictionary,list) or isinstance(dictionary,set):
        vals = [nested_responsify(x) for i,x in enumerate(dictionary)]
        return vals
    new_dict= {}
    for k,v in dictionary.items():
        if isinstance(v,dict):

            new_dict[k] = nested_responsify(v)
        else:
            new_dict[k] = v
    djson = simplejson.dumps(new_dict,default=np_converter,ignore_nan=True)
    return djson.replace('\\"','\"')

In [20]:
def add_sd_dose_clusters(sddf, 
                         clusterer = None,
                         features=None,
                         reducer=None,
                         organ_subset=None,
                         normalize = True,
                         prefix='',
                         n_clusters = 4,
                        ):
    if clusterer is None:
        clusterer = BayesianGaussianMixture(n_init=5,
                                            n_components=n_clusters, 
                                            covariance_type="full",
                                            random_state=100)
    if features is None:
        features=['V35','V40','V45','V50','V55','V60','V65']
    if reducer is None:
        reducer= None#PCA(len(organ_list),whiten=True)
    if organ_subset is None:
        organ_subset = Const.organ_list[:]
    organ_positions = [Const.organ_list.index(o) for o in organ_subset]
    vals = np.stack(sddf[features].apply(lambda x: np.stack([np.array([ii[i] for i in organ_positions]).astype(float) for ii in x]).ravel(),axis=1).values)
    if normalize:
        vals = (vals - vals.mean(axis=0))/(vals.std(axis=0) + .01)
    if reducer is not None:
        vals = reducer.fit_transform(vals)
    df = pd.DataFrame(vals,index = sddf.index)
    clusters = clusterer.fit_predict(vals)
    new_df = sddf.copy()
    cname= prefix+'dose_clusters'
    new_df[cname] = clusters
    new_df = reorder_clusters(new_df,
                              cname,
                              by='mean_dose',
                              organ_list=organ_subset#order by mean dose to clustered organs
                             )
    return new_df

def reorder_clusters(df,cname,by='moderate_6wk_symptoms',organ_list=None):
    df = df.copy()
    df2 = df.copy()
    severities = {}
    clusts = sorted(df[cname].unique())
    getmean = lambda d: d[by].astype(float).mean()
    if organ_list is not None and Utils.iterable(df[by].iloc[0]):
        keep_idx = [Const.organ_list.index(o) for o in organ_list]
        df[by] = df[by].apply(lambda x: [x[i] for i in keep_idx])
    if Utils.iterable(df[by].iloc[0]):
        getmean = lambda d: np.stack(d[by].apply(lambda x: np.array(x).sum()).values).mean()
    for c in clusts:
        subset = df[df[cname] == c]
        avg_severity = getmean(subset)
        severities[c] = avg_severity
    clust_order = np.argsort(sorted(severities.keys(), key = lambda x: severities[x]))
    clust_map = {c: clust_order[i] for i,c in enumerate(clusts)}
    df2[cname] = df[cname].apply(lambda x: clust_map.get(x))
    return df2

def get_df_dose_cols(df,key='DV'):
    return [c for c in df.columns if re.match('[' + key + ']\d+',c) is not None]

def get_df_symptom_cols(df):
    return [c for c in df.columns if 'symptoms_' in c if 'original' not in c]
    
def load_dose_symptom_data():
    data = pd.read_csv(Const.data_dir + 'dose_symptoms_merged.csv')
    to_drop = [c for c in data.columns if 'symptom' in c and ('symptoms_' not in c or 'original' in c)]
    data = data.drop(to_drop,axis=1)
    dose_cols = get_df_dose_cols(data)
    s_cols = get_df_symptom_cols(data) 
    for c in dose_cols + s_cols + ['mean_dose','volume','dates']:
        try:
            data[c] = data[c].apply(literal_eval)
        except Exception as e:
            print(c,e)
    return data

data = load_dose_symptom_data()
add_sd_dose_clusters(data)

Unnamed: 0,id,D10,D15,D2,D20,D25,D30,D35,D40,D45,...,symptoms_sleep,symptoms_sob,symptoms_swallow,symptoms_taste,symptoms_teeth,symptoms_voice,symptoms_vomit,symptoms_walking,symptoms_work,dose_clusters
0,7,"[37.625, 37.15625, 60.3125, 66.3125, 49.4375, ...","[36.5625, 36.75, 59.96875, 64.0625, 48.375, 49...","[39.96875, 38.0625, 60.90625, 71.8125, 52.1562...","[35.25, 36.34375, 59.71875, 62.71875, 47.34375...","[34.0625, 35.90625, 59.40625, 61.53125, 46.562...","[33.125, 35.5, 59.1875, 60.34375, 45.6875, 43....","[32.28125, 35.03125, 58.90625, 59.03125, 44.75...","[31.609375, 34.625, 58.625, 57.71875, 43.96875...","[30.84375, 34.1875, 58.3125, 56.40625, 43.1875...",...,"[0.0, 2.0, 5.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, ...","[3.0, 1.0, 2.0, 1.0, 2.0, 0.0, 3.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 4.0, 6.0, 3.0, 8.0, 3.0, 2.0, ...","[5.0, 3.0, 3.0, 5.0, 8.0, 6.0, 7.0, 6.0, 6.0, ...","[2.0, 0.0, 0.0, 3.0, 3.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 4.0, 2.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 5.0, 0.0, 2.0, 2.0, 0.0, ...","[0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, ...",2
1,8,"[32.15625, 32.0625, 56.6875, 65.375, 22.5, 31....","[31.703125, 31.109375, 55.75, 63.34375, 21.312...","[33.4375, 34.09375, 58.28125, 68.75, 33.6875, ...","[31.203125, 30.25, 54.5625, 61.34375, 17.84375...","[30.609375, 29.40625, 52.375, 60.03125, 14.367...","[29.9375, 28.390625, 50.5625, 59.5, 13.140625,...","[28.96875, 27.453125, 49.46875, 58.96875, 11.7...","[27.1875, 26.46875, 48.59375, 58.53125, 11.203...","[24.875, 25.453125, 48.0, 58.125, 10.9609375, ...",...,"[2.0, 2.0, 2.0, 2.0, 1.0, 0.0, 3.0, 3.0, 3.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1.0, 6.0, 2.0, 2.0, 3.0, 2.0, 1.0, 4.0, 4.0, ...","[1.0, 2.0, 1.0, 2.0, 4.0, 3.0, 2.0, 4.0, 4.0, ...","[0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 2.0, 3.0, 0.0, ...","[0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, ...","[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 2.0, 3.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[1.0, 1.0, 2.0, 0.0, 1.0, 1.0, 1.0, 3.0, 3.0, ...",2
2,9,"[51.4375, 27.515625, 51.4375, 51.4375, 18.6875...","[48.59375, 26.671875, 48.59375, 48.59375, 15.0...","[56.65625, 29.5625, 56.65625, 56.65625, 27.312...","[46.03125, 25.9375, 46.03125, 46.03125, 13.0, ...","[43.53125, 25.296875, 43.53125, 43.53125, 11.3...","[41.28125, 24.671875, 41.28125, 41.28125, 10.3...","[39.15625, 24.046875, 39.15625, 39.15625, 9.82...","[37.0, 23.359375, 37.0, 37.0, 9.4609375, 11.28...","[35.125, 22.625, 35.125, 35.125, 9.109375, 9.3...",...,"[0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 5.0, 5.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 2.0, 2.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[0.0, 2.0, 2.0, 5.0, 6.0, 5.0, 5.0, 9.0, 4.0, ...","[0.0, 0.0, 0.0, 2.0, 1.0, 3.0, 2.0, 2.0, 2.0, ...","[0.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 3.0, 3.0, 3.0, 2.0, 1.0, ...",1
3,10,"[44.21875, 35.625, 64.75, 55.9375, 52.84375, 5...","[43.1875, 34.28125, 62.6875, 54.3125, 50.78125...","[47.84375, 39.21875, 68.5625, 58.15625, 57.406...","[42.0, 33.125, 59.53125, 52.84375, 48.875, 46....","[40.71875, 32.34375, 58.65625, 51.4375, 47.343...","[38.90625, 31.625, 58.3125, 50.0, 46.5, 44.687...","[36.875, 30.921875, 57.8125, 48.75, 46.125, 43...","[35.5625, 30.1875, 56.6875, 47.6875, 45.75, 43...","[33.96875, 29.4375, 55.03125, 46.4375, 45.3125...",...,"[3.0, 0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 3.0, 0.0, ...","[0.0, 3.0, 1.0, 3.0, 2.0, 2.0, 3.0, 4.0, 0.0, ...","[0.0, 0.0, 0.0, 3.0, 3.0, 4.0, 6.0, 5.0, 1.0, ...","[0.0, 0.0, 0.0, 2.0, 6.0, 7.0, 7.0, 0.0, 2.0, ...","[1.0, 0.0, 0.0, 2.0, 2.0, 3.0, 2.0, 6.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0, 0.0, ...","[0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 2.0, 0.0, ...","[0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 4.0, 4.0, 0.0, ...",3
4,11,"[26.15625, 31.984375, 45.40625, 57.125, 29.265...","[25.625, 31.359375, 45.15625, 56.875, 26.48437...","[27.5625, 33.25, 45.84375, 57.65625, 36.75, 44...","[25.09375, 30.71875, 45.0, 56.65625, 22.828125...","[22.625, 30.15625, 44.8125, 56.46875, 19.85937...","[19.75, 29.65625, 44.625, 56.21875, 17.640625,...","[14.1484375, 29.234375, 44.46875, 55.9375, 15....","[11.09375, 28.84375, 44.21875, 55.6875, 14.210...","[9.1015625, 28.46875, 44.03125, 55.4375, 12.53...",...,"[0.0, 1.0, 2.0, 4.0, 1.0, 2.0, 2.0, 3.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 3.0, 2.0, 4.0, 0.0, 2.0, 1.0, ...","[0.0, 0.0, 1.0, 4.0, 7.0, 5.0, 3.0, 5.0, 3.0, ...","[0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 0.0, ...","[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,807,"[35.0625, 38.71875, 57.15625, 55.03125, 38.5, ...","[34.53125, 38.28125, 55.5, 54.46875, 37.0625, ...","[38.125, 39.65625, 58.28125, 56.21875, 43.4062...","[33.90625, 37.9375, 53.21875, 54.09375, 36.156...","[33.34375, 37.5625, 51.5, 53.6875, 35.5, 30.23...","[32.71875, 37.25, 50.0625, 53.28125, 34.875, 2...","[31.75, 36.96875, 48.71875, 52.90625, 34.375, ...","[30.84375, 36.6875, 47.3125, 52.4375, 33.8125,...","[30.078125, 36.375, 45.59375, 51.9375, 33.4062...",...,"[0.0, 0.0, 0.0, 2.0, 1.0, 2.0, 4.0, 4.0, 4.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 2.0, 1.0, 2.0, ...","[0.0, 0.0, 1.0, 3.0, 7.0, 8.0, 10.0, 10.0, 6.0...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, ...",2
345,809,"[1.580078125, 6.76171875, 2.419921875, 17.4375...","[1.1796875, 6.51171875, 2.23046875, 16.65625, ...","[2.6796875, 7.2890625, 3.48046875, 18.40625, 5...","[0.89013671875, 6.23828125, 2.099609375, 15.71...","[0.6201171875, 5.9609375, 2.01953125, 14.64843...","[0.5, 5.640625, 1.9599609375, 13.453125, 4.371...","[0.39990234375, 5.3515625, 1.900390625, 12.210...","[0.330078125, 5.08984375, 1.849609375, 10.8906...","[0.280029296875, 4.73046875, 1.7900390625, 9.6...",...,"[0.0, 1.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 3.0, 3.0, 2.0, 2.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 2.0, 8.0, 8.0, 6.0, 3.0, ...","[0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
346,811,"[51.4375, 24.453125, 51.4375, 51.4375, 26.0312...","[48.59375, 23.265625, 48.59375, 48.59375, 23.6...","[56.65625, 27.078125, 56.65625, 56.65625, 33.5...","[46.03125, 22.1875, 46.03125, 46.03125, 21.718...","[43.53125, 21.21875, 43.53125, 43.53125, 20.48...","[41.28125, 20.375, 41.28125, 41.28125, 19.4375...","[39.15625, 19.484375, 39.15625, 39.15625, 18.3...","[37.0, 18.65625, 37.0, 37.0, 17.515625, 19.031...","[35.125, 17.90625, 35.125, 35.125, 16.75, 17.5...",...,"[5.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 3.0, 1.0, 2.0, 6.0, 0.0, ...","[0.0, 0.0, 1.0, 4.0, 6.0, 8.0, 6.0, 6.0, 3.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...",1
347,813,"[51.4375, 0.0200042724609375, 51.4375, 51.4375...","[48.59375, 0.0200042724609375, 48.59375, 48.59...","[56.65625, 0.0200042724609375, 56.65625, 56.65...","[46.03125, 0.0200042724609375, 46.03125, 46.03...","[43.53125, 0.0200042724609375, 43.53125, 43.53...","[41.28125, 0.0200042724609375, 41.28125, 41.28...","[39.15625, 0.0200042724609375, 39.15625, 39.15...","[37.0, 0.0200042724609375, 37.0, 37.0, 0.02000...","[35.125, 0.0200042724609375, 35.125, 35.125, 0...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 2.0, 4.0, 1.0, 4.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 3.0, 6.0, 7.0, 6.0, 6.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...",0


In [22]:
data.t4

0      0
1      0
2      0
3      1
4      0
      ..
344    0
345    0
346    0
347    0
348    0
Name: t4, Length: 349, dtype: int64

In [5]:
#overly complicated code to get various p-values using likelihood ratio test
#good for just correlations vs symptoms
def add_confounder_dose_limits(df,organ_list=None):
    #dose limits as binary values from https://applications.emro.who.int/imemrf/Rep_Radiother_Oncol/Rep_Radiother_Oncol_2013_1_1_35_48.pdf
    #not inlcudeing other stuff like eyes at this time
    #also, my max dose is weird so I'm using V10 for that because I feel like that makes sense
    #using the 
    if organ_list is None:
        organ_list = Const.organ_list[:]
    df = df.copy()
    original_cols = set(df.columns)
    getval = lambda organ,param: df[param].apply(lambda x: x[organ_list.index(organ)])
    get_lr_val = lambda organ,param: np.maximum(getval('Lt_'+organ,param),getval('Rt_'+organ,param))
    
    maxdose_var = 'max_dose'
   
    #xerostomia. >25 for 1 or >20 for both
    df['Parotid_Gland_limit'] = (get_lr_val('Parotid_Gland','mean_dose') > 20) | (getval('Lt_Parotid_Gland','mean_dose') > 25) | (getval('Rt_Parotid_Gland','mean_dose') > 25)
    
    #there is 50 for PEG tube and 60 for aspiration so i'll do 50
    for o in ['IPC','MPC',"SPC"]:
        df[o+"_limit"] = getval(o,'mean_dose') > 50
        df[o+"_limit2"] = getval(o,'mean_dose') > 60
    
    #edema
    df['Larynx_limit'] = getval('Larynx','V50') > 27
    
    #Esophagitus
    elimits = [('V35',50),('V50',40),('V70',20),('V60',30)]
    df['Esophagus_limit'] = np.stack([(getval('Esophagus',v) > lim) for v,lim in elimits]).sum(axis=0) > 0
    return df

def add_total_doses(df,cols):
    df = df.copy()
    for col in cols:
        if col in df.columns:
            df['total_'+col] = df[col].apply(np.sum)
    return df

def var_test(df, testcol, ycol,xcols, 
             boolean=True,
             regularize = False,
             scale=True):
    y = df[ycol]
    if testcol not in xcols:
        xcols = xcols + [testcol]
    x = df[xcols].astype(float)
    if regularize:
        for col in xcols:
            x[col] = (x[col] - x[col].mean())/(x[col].std()+ .01)
    if scale:
        for col in xcols:
            x[col] = (x[col] - x[col].min())/(x[col].max() - x[col].min())
    for col in xcols:
        if x[col].std() < .00001:
            x = x.drop(col,axis=1)
    x2 = x.copy()
    x2 = x2.drop(testcol,axis=1)
    if boolean:
        model = sm.Logit
        method = 'bfgs'
    else:
        model = sm.OLS
        method= 'qr'
    logit = model(y,x)
    logit_res = logit.fit(maxiter=500,
                          disp=False,
                          method=method,
                         )
    
    logit2 = model(y,x2)
    logit2_res = logit2.fit(maxiter=500,
                            disp=False,
                            method=method,
                           )
    
    llr_stat = 2*(logit_res.llf - logit2_res.llf)
    llr_p_val = chi2.sf(llr_stat,1)
    
    aic_diff = logit_res.aic - logit2_res.aic
    bic_diff = logit_res.bic - logit2_res.bic
    
    results = {
        'ttest_pval': logit_res.pvalues[testcol],
        'ttest_tval': logit_res.tvalues[testcol],
        'lrt_pval': llr_p_val,
        'aic_diff': aic_diff,
        'bic_diff': bic_diff
    }
    return results

def get_cluster_lrt(df,clust_key = 'dose_clusters',
                             symptoms=None,
                             nWeeks = None,
                             thresholds=None,
                             confounders=None,
                            ):
    #add tests for pvalues for data
    if symptoms is None:
        symptoms = Const.symptoms[:]
    if nWeeks is None:
        nWeeks = [13,59]
    if confounders is None:
        confounders = ['t4',
                       'n3',
                       'hpv',
                       'BOT',
                       'Tonsil',
                       'total_mean_dose',
#                        'Larynx_limit',
#                        'Parotid_Gland_limit'
                      ]
    date_keys = [df.dates.iloc[0].index(week) for week in nWeeks if week in df.dates.iloc[0]]
    #calculate change from baseline instead of absolute
    get_symptom_change_max = lambda x: np.max([x[d]-x[0] for d in date_keys])
    get_symptom_max = lambda x: np.max([x[d] for d in date_keys])
    df = add_confounder_dose_limits(df)
    
    tdose_cols = [c.replace('total_','') for c in confounders if 'total_' in c]
    if len(tdose_cols) > 0:
        df = add_total_doses(df,tdose_cols)
        
    clust_results = []
    for symptom in symptoms:
        skey = 'symptoms_'+symptom
        if skey not in df.columns:
            continue
        max_symptoms = df[skey].apply(get_symptom_max).values
        for threshold in [-1, 5, 7]:
            colname=  'cluster_'+symptom
            boolean = threshold > 0
            if boolean:
                y = max_symptoms >= threshold
                colname += '_'+str(threshold)
            else:
                y = max_symptoms/10
            names = ['lrt_pval','ttest_tval','ttest_pval','aic_diff']
            for n in names:
                df[colname+'_'+n] = -1
            for clust in df[clust_key].unique():
                in_clust = df[clust_key] == clust
                if len(np.unique(y)) < 2:
                    continue
                else:
                    df['x'] = in_clust
                    df['y'] = y
                    res = var_test(df,'x','y',confounders,regularize=boolean,boolean=boolean)
                    for name in names:
                        if not pd.isnull(res[name]):
                            df.loc[df[in_clust].index,[colname+'_'+name]] = res[name]
                    
    return df
        
        
tempdf = add_sd_dose_clusters(data,organ_subset = ['Tongue','Genioglossus_M','Lt_Submandibular_Gland','Rt_Submandibular_Gland'])
temp = get_cluster_lrt(tempdf)
for c,subdf in temp.groupby('dose_clusters'):
    print(c,subdf[['cluster_' + s + '_7_aic_diff' for s in ['pain','drymouth','taste']]].mean())
    print(c,subdf[['cluster_' + s + '_7_lrt_pval' for s in ['pain','drymouth','taste']]].mean())

0 cluster_pain_7_aic_diff       -6.654488
cluster_drymouth_7_aic_diff    0.381363
cluster_taste_7_aic_diff       1.726519
dtype: float64
0 cluster_pain_7_lrt_pval        0.003263
cluster_drymouth_7_lrt_pval    0.203282
cluster_taste_7_lrt_pval       0.601006
dtype: float64
1 cluster_pain_7_aic_diff        0.017636
cluster_drymouth_7_aic_diff    0.037218
cluster_taste_7_aic_diff       1.465429
dtype: float64
1 cluster_pain_7_lrt_pval        0.159142
cluster_drymouth_7_lrt_pval    0.161216
cluster_taste_7_lrt_pval       0.464691
dtype: float64
2 cluster_pain_7_aic_diff       -1.057089
cluster_drymouth_7_aic_diff    0.882254
cluster_taste_7_aic_diff      -5.083558
dtype: float64
2 cluster_pain_7_lrt_pval        0.080386
cluster_drymouth_7_lrt_pval    0.290404
cluster_taste_7_lrt_pval       0.007779
dtype: float64
3 cluster_pain_7_aic_diff        1.976325
cluster_drymouth_7_aic_diff   -1.231766
cluster_taste_7_aic_diff      -5.906187
dtype: float64
3 cluster_pain_7_lrt_pval        0.877716

In [6]:
#old stuff 
def get_cluster_correlations(df,clust_key = 'dose_clusters',
                             symptoms=None,
                             nWeeks = None,
                             thresholds=None,
                             baselines=[False],
                            ):
    #add tests for pvalues for data
    if symptoms is None:
        symptoms = Const.symptoms[:]
    if nWeeks is None:
        nWeeks = [13,33]
    if thresholds is None:
        thresholds = [5,7]
    date_keys = [df.dates.iloc[0].index(week) for week in nWeeks if week in df.dates.iloc[0]]
    #calculate change from baseline instead of absolute
    get_symptom_change_max = lambda x: np.max([x[d]-x[0] for d in date_keys])
    get_symptom_max = lambda x: np.max([x[d] for d in date_keys])
    df = df.copy()
    clust_results = []
    
    for symptom in symptoms:
        skey = 'symptoms_'+symptom
        if skey not in df.columns:
            continue
        max_symptoms = df[skey].apply(get_symptom_max).values
        max_change = df[skey].apply(get_symptom_change_max).values
        for threshold in thresholds:
            for baseline in baselines:
                if baseline:
                    y = (max_change >= threshold).astype(int)
                else:
                    y = (max_symptoms >= threshold).astype(int)
                colname=  'cluster_'+symptom
                if baseline:
                    colname += '_change'
                colname += "_" + str(threshold)
                df[colname+'_odds_ratio'] = -1
                df[colname+'_pval'] = -1
                for clust in df[clust_key].unique():
                    in_clust = df[clust_key] == clust
                    if len(np.unique(y)) < 2:
                        (odds_ratio,pval) = (0,1)
                    else:
                        (odds_ratio, pval) = Metrics.boolean_fisher_exact(in_clust.astype(int),y)
                    df.loc[df[in_clust].index,[colname+'_odds_ratio']] = odds_ratio
                    df.loc[df[in_clust].index,[colname+'_pval']] = pval
    return df
        
        
tempdf = add_sd_dose_clusters(data,organ_subset = ['IPC','SPC',"MPC"])
temp = get_cluster_correlations(tempdf,thresholds=[5,7,9],baselines=[False])
temp[[c for c in temp.columns if '_odds_ratio' in c or '_pval' in c]]

Unnamed: 0,cluster_pain_5_odds_ratio,cluster_pain_5_pval,cluster_pain_7_odds_ratio,cluster_pain_7_pval,cluster_pain_9_odds_ratio,cluster_pain_9_pval,cluster_fatigue_5_odds_ratio,cluster_fatigue_5_pval,cluster_fatigue_7_odds_ratio,cluster_fatigue_7_pval,...,cluster_walking_7_odds_ratio,cluster_walking_7_pval,cluster_walking_9_odds_ratio,cluster_walking_9_pval,cluster_enjoy_5_odds_ratio,cluster_enjoy_5_pval,cluster_enjoy_7_odds_ratio,cluster_enjoy_7_pval,cluster_enjoy_9_odds_ratio,cluster_enjoy_9_pval
0,0.907580,0.863563,0.508136,0.408370,0.000000,1.000000,1.106647,0.773378,1.873357,0.107960,...,0.746795,1.000000,1.132075,1.000000,1.429213,0.311895,1.617739,0.339246,0.669231,0.761387
1,0.959707,1.000000,0.891304,1.000000,0.000000,1.000000,1.556158,0.245867,0.625287,0.595391,...,1.265217,0.673787,3.180851,0.359344,1.305000,0.502983,2.544643,0.099457,4.258721,0.021882
2,0.875897,0.857016,0.913858,1.000000,2.771739,0.462504,0.759386,0.452806,0.752093,0.675471,...,1.393258,0.739818,0.000000,0.567834,1.157111,0.725509,0.415789,0.213492,0.820000,1.000000
3,0.959707,1.000000,0.891304,1.000000,0.000000,1.000000,1.556158,0.245867,0.625287,0.595391,...,1.265217,0.673787,3.180851,0.359344,1.305000,0.502983,2.544643,0.099457,4.258721,0.021882
4,0.907580,0.863563,0.508136,0.408370,0.000000,1.000000,1.106647,0.773378,1.873357,0.107960,...,0.746795,1.000000,1.132075,1.000000,1.429213,0.311895,1.617739,0.339246,0.669231,0.761387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,1.267442,0.485186,1.977541,0.256257,2.470000,0.495636,0.873358,0.769509,0.802867,0.686669,...,0.812925,1.000000,1.230000,1.000000,0.447312,0.058155,0.526919,0.334078,0.435262,0.361533
345,0.875897,0.857016,0.913858,1.000000,2.771739,0.462504,0.759386,0.452806,0.752093,0.675471,...,1.393258,0.739818,0.000000,0.567834,1.157111,0.725509,0.415789,0.213492,0.820000,1.000000
346,0.875897,0.857016,0.913858,1.000000,2.771739,0.462504,0.759386,0.452806,0.752093,0.675471,...,1.393258,0.739818,0.000000,0.567834,1.157111,0.725509,0.415789,0.213492,0.820000,1.000000
347,0.875897,0.857016,0.913858,1.000000,2.771739,0.462504,0.759386,0.452806,0.752093,0.675471,...,1.393258,0.739818,0.000000,0.567834,1.157111,0.725509,0.415789,0.213492,0.820000,1.000000


In [7]:
def keyword_clusterer(cluster_type, n_clusters,**kwargs):
    clusterer = None
    if cluster_type.lower() == 'bgmm':
        clusterer = BayesianGaussianMixture(n_init=5,
                                            n_components=n_clusters, 
                                            covariance_type="full",
                                            random_state=100)
    if cluster_type.lower() == 'gmm':
        clusterer = GaussianMixture(n_init=5,
                                    n_components=n_clusters, 
                                    covariance_type="full",
                                    random_state=100)
    if cluster_type.lower() == 'spectral':
        clusterer = SpectralClustering(n_clusters=n_clusters)
    if cluster_type.lower() == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters,max_iter=1000)
    if cluster_type.lower() == 'ward':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='ward')
    return clusterer

def get_cluster_json(df,
                     organ_list=None,
                     quantiles = None,
                     sdates = [13,33],
                     other_values = None,
                     add_metrics = True,
                     clustertype = None,
                     confounders=None,
                     n_clusters = 4,
                     **kwargs):
    if organ_list is None:
        organ_list = Const.organ_list[:]
    clusterer = None
    if clustertype is not None:
        clusterer = keyword_clusterer(clustertype,n_clusters)
    df = add_sd_dose_clusters(df.copy(),
                              organ_subset = organ_list,
                              clusterer=clusterer,
                              n_clusters = n_clusters,
                              **kwargs)
    clust_dfs = []
    dose_cols = get_df_dose_cols(df,key='V') + ['mean_dose','volume']
    s_cols = get_df_symptom_cols(df)
    if quantiles is None:
        quantiles = np.linspace(.1,.9,6) 
    dates = df.dates.iloc[0]
    date_positions = [(sdate, dates.index(sdate)) for sdate in sdates if sdate in dates]
    #i'm asuming these are discrete
    if other_values is None:
        other_values = [
            'subsite',
            'n_stage','t_stage',
            'os',
            'hpv',
            'is_male',
            'chemotherapy','concurrent','ic','rt',
            'digest_increase'
        ]
    #adds in pvalues and odds ratio
    stats_cols=[]
    if add_metrics:
        old_cols = df.columns
        df = get_cluster_correlations(df,
                                      thresholds=[5,7],
                                      clust_key='dose_clusters',
                                      baselines=[False],
                                      nWeeks=sdates)
        df = get_cluster_lrt(df,
                              clust_key='dose_clusters',
                             confounders=confounders,
                              nWeeks=sdates)
        stats_cols =sorted(set(df.columns) - set(old_cols))
    df = df.reset_index()
    for c,subdf in df.groupby('dose_clusters'):
        clust_entry = {
            'cluster_size': subdf.shape[0],
            'dates':dates,
            'ids': subdf.id.values.tolist(),
            'clusterId': c,
            }
        
        for organ in Const.organ_list:
            opos = Const.organ_list.index(organ)
            for dcol in dose_cols:
#                 print(dcol,len(subdf[dcol].iloc[0]),len(Const.organ_list))
                vals = subdf[dcol].apply(lambda x: x[opos])
                qvals = vals.quantile(quantiles)
                clust_entry[organ+'_'+dcol] = qvals.values.astype(float).tolist()
            
        for scol in s_cols:
            sname = scol.replace('symptoms_','')
            clust_entry[sname] = subdf[scol].apply(lambda x: [int(i) for i in x]).values.tolist()
        for col in other_values:
            unique = df[col].unique()
            entry = {}
            for val in unique:
                clust_entry[col+'_'+str(val)] = float((subdf[col] == val).sum())
                clust_entry[col+'_'+str(val)+'_mean'] = float((subdf[col] == val).mean())
        for statcol in stats_cols:
            val = subdf[statcol].iloc[0]
            clust_entry[statcol] = val
        clust_dfs.append(clust_entry)
    return clust_dfs

test = get_cluster_json(data,
                        clustertype='kmeans',
                        organ_list = ['IPC','SPC','MPC'])
[{k:v for k,v in i.items() if 'cluster_' in k} for i in test]

[{'cluster_size': 95,
  'cluster_activity_5_aic_diff': -3.406896216409109,
  'cluster_activity_5_lrt_pval': 0.020057347633955433,
  'cluster_activity_5_odds_ratio': 0.8979223125564589,
  'cluster_activity_5_pval': 0.8692185035387496,
  'cluster_activity_5_ttest_pval': 0.026108095145790756,
  'cluster_activity_5_ttest_tval': -2.224600111636403,
  'cluster_activity_7_aic_diff': -3.715615142619839,
  'cluster_activity_7_lrt_pval': 0.016814672016208353,
  'cluster_activity_7_odds_ratio': 1.1832386363636365,
  'cluster_activity_7_pval': 0.8086749026605207,
  'cluster_activity_7_ttest_pval': 0.02714538215134701,
  'cluster_activity_7_ttest_tval': -2.2094208495748617,
  'cluster_activity_aic_diff': 0.06214982472488373,
  'cluster_activity_lrt_pval': 0.1639021489450042,
  'cluster_activity_ttest_pval': 0.16850325050209322,
  'cluster_activity_ttest_tval': 1.3799507295957798,
  'cluster_appetite_5_aic_diff': -0.681519253448414,
  'cluster_appetite_5_lrt_pval': 0.10151883543944382,
  'cluster_ap

In [8]:
def sddf_to_json(df,
                 to_drop =None,
                 add_pca = True,
                 dose_pca_features = None,
                ):
    if to_drop is None:
        to_drop = ['min_dose','is_ajcc_8th_edition']
    df = df.copy().fillna(0)
    df['totalDose'] = df['mean_dose'].apply(np.sum)
    df['organList'] = [Const.organ_list[:] for i in range(df.shape[0])]
    if add_pca:
        if dose_pca_features is None:
            dose_pca_features = ['V35','V40','V45','V50','V55','V60','V65']
        dose_x = np.stack(df[dose_pca_features].apply(lambda x: np.stack(x).ravel(),axis=1).values)
        dose_x_pca = PCA(3).fit_transform(dose_x)
        df['dose_pca'] = [x.tolist() for x in dose_x_pca]

        symptom_cols = [c for c in df.columns if 'symptoms_' in c and 'original' not in c] 
        valid_sd = [i for i,date in enumerate(df.dates.iloc[0]) if date <= 33]
        late_sd = [i for i,date in enumerate(df.dates.iloc[0]) if date <= 33 and date > 7]
        treatment_sd = [i for i,date in enumerate(df.dates.iloc[0]) if date <= 7]
        for name, pos_list in zip(['all','post','treatment'],[valid_sd,late_sd,treatment_sd]):
            symptom_x = np.stack(df[symptom_cols].apply(lambda x: np.stack([x[i] for i in pos_list]).ravel(),axis=1).values)
            symptom_x_pca = PCA(3).fit_transform(symptom_x)
            df['symptom_'+name+'_pca'] = [x.tolist() for x in symptom_x_pca]
    
    is_dose_dvh = lambda x: re.match('D[0-9][0-9]?',x) is not None
    vol_dvh_too_high = lambda x: re.match('V[0-18-9][0-9]?',x) is not None
    for c in df.columns:
        if is_dose_dvh(c) or vol_dvh_too_high(c):
            to_drop.append(c)
        if 'symptoms' in c and 'original' in c:
            to_drop.append(c)
        if '_max_' in c:
            to_drop.append(c)
    df = df.drop(to_drop,axis=1)
    ddict = df.reset_index().to_dict(orient='records')
    return ddict

sddf_to_json(data)

[{'index': 0,
  'id': 7,
  'V20': [61.5,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   67.625,
   100.0,
   100.0,
   100.0,
   100.0,
   54.46875,
   100.0,
   89.0,
   58.25,
   99.875,
   95.3125,
   33.25,
   100.0,
   100.0,
   73.625,
   64.1875,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   99.3125,
   88.6875,
   82.3125,
   93.8125,
   33.125,
   100.0],
  'V25': [58.03125,
   98.5,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   53.90625,
   100.0,
   100.0,
   100.0,
   100.0,
   50.65625,
   100.0,
   75.4375,
   48.59375,
   99.5,
   74.1875,
   27.5,
   100.0,
   100.0,
   61.5,
   54.5625,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   100.0,
   96.25,
   84.4375,
   60.25,
   72.875,
   6.78125,
   100.0],
  'V30': [49.0625,
   90.6875,
   100.0,
   100.0,
   100.0,
   99.75,
   100.0,
   100.0,
   35.25,
   90.9375,
   88.75,


In [17]:

def add_late_symptoms(df,symptoms=None):
    df = df.copy()
    if symptoms is None:
        symptoms = Const.symptoms[:]
    date_idxs = [i for i,v in enumerate(df.dates.iloc[0]) if v > 12 and v < 35]
    for symptom in symptoms:
        mval = df['symptoms_'+symptom].apply(lambda x: np.max([x[i] for i in date_idxs]))
        df[symptom+'_late'] = mval
    return df


def multi_var_tests(df, testcols, ycol,xcols, 
             boolean=True,
             regularize = False,
             scale=True):
    y = df[ycol]
    xcols = list(set(xcols).union(set(testcols)))
    x = df[xcols].astype(float)
    if regularize:
        for col in xcols:
            x[col] = (x[col] - x[col].mean())/(x[col].std()+ .01)
    if scale:
        for col in xcols:
            x[col] = (x[col] - x[col].min())/(x[col].max() - x[col].min())
    for col in xcols:
        if x[col].std() < .00001:
            x = x.drop(col,axis=1)
    x2 = x.copy()
    x2 = x2.drop(testcols,axis=1)
    if boolean:
        model = sm.Logit
        method = 'bfgs'
    else:
        model = sm.OLS
        method= 'qr'
    logit = model(y,x)
    logit_res = logit.fit(maxiter=500,
                          disp=False,
                          method=method,
                         )
    
    logit2 = model(y,x2)
    logit2_res = logit2.fit(maxiter=500,
                            disp=False,
                            method=method,
                           )
    
    llr_stat = 2*(logit_res.llf - logit2_res.llf)
    llr_p_val = chi2.sf(llr_stat,len(testcols))
    
    aic_diff = logit_res.aic - logit2_res.aic
    bic_diff = logit_res.bic - logit2_res.bic
    
    results = {
        'lrt_pval': llr_p_val,
        'aic_diff': aic_diff,
        'bic_diff': bic_diff
    }
    for testcol in testcols:
        results['ttest_pval_' + str(testcol)]= logit_res.pvalues[testcol]
        results['ttest_tval_' + str(testcol)]= logit_res.tvalues[testcol]
    return results

def select_single_organ_cluster_effects(df,
                                        symptoms=None,
                                        base_organs=None,
                                        covars=None,
                                        n_clusters=4,
                                        clustertype=None,
                                        threshold=None,
                                        drop_base_cluster=True,
                                        features=None,
                                        organ_list=None):
    if base_organs is None:
        base_organs = []
    if organ_list is None:
        #imma just skip stuff that's like probably not relevant for this usage
        exclude = set(['Brainstem',"Spinal_Cord",
                   'Lt_Brachial_Plexus','Rt_Brachial_Plexus',
                   'Lower_Lip',"Upper_Lip",
                   'Hyoid_bone','Mandible',
                   'Cricoid_cartilage',
                    'Thyroid_cartilage',
                  ])
        organ_list = [o for o in Const.organ_list if o not in exclude]
    if symptoms is None:
        symptoms=Const.symptoms[:]
    if isinstance(symptoms,str):
        symptoms=[symptoms]
    df = add_late_symptoms(df,symptoms)
    df = add_confounder_dose_limits(df)
    olists = [base_organs] if len(base_organs) > 0 else []
    for o in organ_list:
        if o in base_organs:
            continue
        if 'Rt_' in o:
            continue
        new_list = [o]
        if len(base_organs) > 0:
            new_list = new_list + base_organs
        if 'Lt_' in o:
            new_list.append(o.replace('Lt_','Rt_'))
        if len(new_list) > len(base_organs):
            olists.append(new_list)
    if covars is None:
        covars = [
            'Parotid_Gland_limit',
          'IPC_limit','MPC_limit','SPC_limit',
          't4','n3','hpv','total_dose',
          "BOT","Tonsil",
         ]
    df = df.copy()
    df['total_dose'] = df.mean_dose.apply(lambda x: np.sum(x))
    results = []
    base_pval = 1
    completed_clusters = set([])
    
    clusterer = None
    if clustertype is not None:
        clusterer = keyword_clusterer(clustertype,n_clusters)
        
    for olist in olists:
        prefix = '_'.join(olist)+'_'
        df  = add_sd_dose_clusters(df,
                                     features = features,
                                     organ_subset=olist,
                                     prefix=prefix,
                                    clusterer=clusterer,
                                     n_clusters=n_clusters,
            )
        clustname = prefix+'dose_clusters'
        xvals = []
        for cval in df[clustname].unique():
            if cval == 0 and drop_base_cluster:
                continue
            df['x'+str(cval)] = (df[clustname] == cval).astype(int)
            xvals.append('x'+str(cval))
        for symptom in symptoms:
            outcome = symptom + '_late'
            if threshold is None:
                df['y'] = df[outcome]
            else:
                df['y'] = (df[outcome] >= threshold)
            res = multi_var_tests(df,xvals,'y',covars,boolean=(threshold is not None))
            entry = {
                'outcome':outcome,
                'base_organs':base_organs,
                'added_organs':sorted(set(olist)-set(base_organs)),
                'threshold':threshold,
                'clustertype':clustertype,
            }
            if ''.join(olist) == ''.join(base_organs):
                base_pval = res['lrt_pval']
            entry['pval_change'] = base_pval - res['lrt_pval']
            for k,v in res.items():
                entry[k] = v
            results.append(entry)
    #sort by effect size of highest-dose cluster
    results= sorted(results,key=lambda x: -x['ttest_tval_x'+str(n_clusters-1)])
    return results

test = select_single_organ_cluster_effects(data,
                                           'drymouth',
#                                            threshold=5,
                                           clustertype='ward',
                                           base_organs=[])
test

[{'outcome': 'drymouth_late',
  'base_organs': [],
  'added_organs': ['Hard_Palate'],
  'pval_change': 0.9866997199780493,
  'lrt_pval': 0.013300280021950687,
  'aic_diff': -4.726547020372209,
  'bic_diff': 6.838668746235044,
  'ttest_pval_x1': 0.009873690973200693,
  'ttest_tval_x1': 2.595001852182821,
  'ttest_pval_x2': 0.6106095574380528,
  'ttest_tval_x2': 0.5096810065288481,
  'ttest_pval_x3': 0.021764882196522957,
  'ttest_tval_x3': 2.305188747750022},
 {'outcome': 'drymouth_late',
  'base_organs': [],
  'added_organs': ['Esophagus'],
  'pval_change': 0.784183713468346,
  'lrt_pval': 0.2158162865316539,
  'aic_diff': 1.539229632302522,
  'bic_diff': 13.104445398909775,
  'ttest_pval_x3': 0.061419630282053134,
  'ttest_tval_x3': 1.8767440197353862,
  'ttest_pval_x2': 0.3942055244726669,
  'ttest_tval_x2': 0.8531109294266779,
  'ttest_pval_x1': 0.5108376974137857,
  'ttest_tval_x1': -0.6582352906849689},
 {'outcome': 'drymouth_late',
  'base_organs': [],
  'added_organs': ['Lt_Late

In [14]:
simplejson.loads(simplejson.dumps(test))

[{'outcome': 'drymouth_late',
  'base_organs': [],
  'added_organs': ['Hard_Palate'],
  'pval_change': 0.9324846927503768,
  'lrt_pval': 0.06751530724962314,
  'aic_diff': -1.1417185034581507,
  'bic_diff': 10.423497263149102,
  'ttest_pval_x1': 0.07907697190922815,
  'ttest_tval_x1': 1.7614190500549687,
  'ttest_pval_x2': 0.3234084331218686,
  'ttest_tval_x2': 0.9889311615736106,
  'ttest_pval_x3': 0.024457381183960385,
  'ttest_tval_x3': 2.2600589799350477},
 {'outcome': 'drymouth_late',
  'base_organs': [],
  'added_organs': ['Esophagus'],
  'pval_change': 0.8578759866267547,
  'lrt_pval': 0.14212401337324526,
  'aic_diff': 0.5575902020420926,
  'bic_diff': 12.122805968649345,
  'ttest_pval_x1': 0.6896248060111695,
  'ttest_tval_x1': 0.3997090323239194,
  'ttest_pval_x3': 0.04210083902045543,
  'ttest_tval_x3': 2.0403108392389395,
  'ttest_pval_x2': 0.16520262118156948,
  'ttest_tval_x2': 1.3908126628177466},
 {'outcome': 'drymouth_late',
  'base_organs': [],
  'added_organs': ['Lar

In [None]:
re.match('^x[0-9]*$','xx100') is not None

In [None]:
test = [1,2,3,4]
test.extend(1)
test