In [1]:
import pandas as pd
import numpy as np
import re
from Utils import *

##load file names
csv = './Anonymized_644.Updated_cleaned_v1.3.2.tsv'
adjacency_file = './connectivity_646.csv'
adjacency = pd.read_csv(adjacency_file, index_col = 0)
node_list = sorted(adjacency.columns)
print(node_list)

['1A', '1B', '2A', '2B', '3', '4', '5A', '5B', '6', 'RPLN']


In [2]:
adjacency.head()

Unnamed: 0,1A,1B,2A,2B,3,4,5A,5B,6,RPLN
1A,1,1,0,0,0,0,0,0,1,0
1B,1,1,1,0,1,0,0,0,0,0
2A,0,1,1,1,1,0,0,0,0,0
2B,0,0,1,1,0,0,1,0,0,0
3,0,1,1,0,1,1,1,0,1,0


In [3]:
#extract all the node names and such
left_nodes = ['L'+n for n in adjacency.columns]
right_nodes = ['R'+n for n in adjacency.columns]
rpln = ['RRPLN', 'LRPLN']
nodes = left_nodes + right_nodes
all_nodes = set(nodes)
node_to_index = {word: position for position, word in enumerate(nodes)}
node_to_index

{'L1A': 0,
 'L1B': 1,
 'L2A': 2,
 'L2B': 3,
 'L3': 4,
 'L4': 5,
 'L5A': 6,
 'L5B': 7,
 'L6': 8,
 'LRPLN': 9,
 'R1A': 10,
 'R1B': 11,
 'R2A': 12,
 'R2B': 13,
 'R3': 14,
 'R4': 15,
 'R5A': 16,
 'R5B': 17,
 'R6': 18,
 'RRPLN': 19}

In [4]:
#helper functions
def parse_lymph_nodes(node_string):
    #the data apparently has just '2' when theres a '2A' and '2B'
    node_string = re.sub('L2,*','L2A, L2B,', node_string)
    node_string = re.sub('R2,*','R2A, R2B,', node_string)
    node_string = re.sub('R RPLN', 'RRPLN', node_string)
    node_string = re.sub('L RPLN', 'LRPLN', node_string)
    nodes = [n.strip() for n in node_string.split(',')]
    #remove the node with 'in-between' labeled nodes?
    ambiguous_nodes = set(['2/3','3/4','2/3/4','/3','2/','-R4'])

    for n in nodes:
        if n in ambiguous_nodes:
            return np.NaN
    nodes = [n for n in nodes if n in all_nodes]
    return nodes if len(nodes) > 0 else np.NaN

data_columns = ['Dummy ID', 
               'Affected Lymph node UPPER',
               'Feeding tube 6m', 
               'Aspiration rate(Y/N)',
               'Age at Diagnosis (Calculated)',
               'Pathological Grade',
               'Gender',
               'Race',
                'Total dose',
                'Total fractions',
               'Tm Laterality (R/L)',
               'Tumor subsite (BOT/Tonsil/Soft Palate/Pharyngeal wall/GPS/NOS)',
               'T-category', 'N-category', 'HPV/P16 status', 
               'AJCC 7th edition','AJCC 8th edition','Smoking status (Packs/Year)']

data = pd.read_csv(csv, sep='\t' , index_col=0, 
                   usecols=data_columns,
                   dtype = {'Affected Lymph node UPPER': str}).dropna(subset=['Affected Lymph node UPPER'])
data['Affected Lymph node UPPER'] = data['Affected Lymph node UPPER'].apply(parse_lymph_nodes)
data = data.dropna(subset=['Affected Lymph node UPPER'])
print(data.shape)
data.head()

(593, 17)


Unnamed: 0_level_0,Age at Diagnosis (Calculated),Pathological Grade,Gender,Race,Tm Laterality (R/L),Tumor subsite (BOT/Tonsil/Soft Palate/Pharyngeal wall/GPS/NOS),Affected Lymph node UPPER,HPV/P16 status,T-category,N-category,AJCC 7th edition,AJCC 8th edition,Smoking status (Packs/Year),Total dose,Total fractions,Feeding tube 6m,Aspiration rate(Y/N)
Dummy ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,57.019444,III,Male,White/Caucasion,R,Tonsil,"[R2A, R2B, R3]",Positive,T1,N2,IV,II,30,66.0,30,N,N
2,55.855556,III,Female,White/Caucasion,R,BOT,"[R2A, R2B]",Positive,T2,N2,IV,II,0,66.0,30,N,N
3,60.222222,II,Female,White/Caucasion,L,Tonsil,"[L2A, L2B, L3]",Positive,T2,N2,IV,II,0,70.0,33,N,N
4,66.344444,III,Male,White/Caucasion,R,BOT,"[R2A, R2B]",Positive,T1,N1,III,I,0,66.0,30,N,N
5,49.733333,II,Male,White/Caucasion,R,BOT,[R3],Positive,T4,N2,IV,III,0,69.96,33,N,N


In [5]:
monograms = pd.DataFrame(index = data.index, columns = nodes, dtype = np.int32).fillna(0)
for pos, p in enumerate(data['Affected Lymph node UPPER']):
    index = data.index[pos]
    for lymph_node in p:
        monograms.loc[index, lymph_node] = 1
# monograms.LRPLN *= -1
# monograms.RRPLN *= -1
print(monograms.head(5))
monograms.sum().transpose()

          L1A  L1B  L2A  L2B   L3   L4  L5A  L5B   L6  LRPLN  R1A  R1B  R2A  \
Dummy ID                                                                      
1         0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0  1.0   
2         0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0  1.0   
3         0.0  0.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0  0.0   
4         0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0  1.0   
5         0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0  0.0   

          R2B   R3   R4  R5A  R5B   R6  RRPLN  
Dummy ID                                       
1         1.0  1.0  0.0  0.0  0.0  0.0    0.0  
2         1.0  0.0  0.0  0.0  0.0  0.0    0.0  
3         0.0  0.0  0.0  0.0  0.0  0.0    0.0  
4         1.0  0.0  0.0  0.0  0.0  0.0    0.0  
5         0.0  1.0  0.0  0.0  0.0  0.0    0.0  


L1A        3.0
L1B       22.0
L2A      322.0
L2B      322.0
L3       152.0
L4        31.0
L5A        7.0
L5B        5.0
L6         0.0
LRPLN     22.0
R1A        2.0
R1B       21.0
R2A      362.0
R2B      362.0
R3       158.0
R4        44.0
R5A        7.0
R5B        4.0
R6         0.0
RRPLN     30.0
dtype: float64

In [6]:
bigram_set = set([])

for i, name in enumerate(node_list):
    for i2 in range(i+1, len(node_list)):
        if adjacency.iloc[i,i2] > 0:
            bigram_set.add(name + node_list[i2])
' '.join(sorted(bigram_set))
bigram_names = (sorted(bigram_set))
bigram_names

['1A1B',
 '1A6',
 '1B2A',
 '1B3',
 '2A2B',
 '2A3',
 '2B5A',
 '34',
 '35A',
 '36',
 '45B',
 '46',
 '5A5B']

In [7]:
def bigramize(v, side):
    #shoudl take a unilateral (left or right) matrix of affected lypmh nnodes
    assert(v.shape[1] == adjacency.shape[1])
    col_names = list(v.columns)
    clean = lambda x:  re.sub('^[LR]\s*','', x)
    bigrams = []
    names = []
    for i, colname in enumerate(col_names):
        nodename = clean(colname)
        for i2 in range(i+1, v.shape[1]):
            colname2 = col_names[i2]
            bigram_name = nodename + clean(colname2)
            if bigram_name in bigram_set:
                if bigram_name not in names:
                    names.append(side + bigram_name)
                bigram_vector = v[colname].values * v[colname2].values
                bigrams.append(bigram_vector.reshape(-1,1))
    print(names)
    return pd.DataFrame(np.hstack(bigrams), columns = names, index = data.index)
            
l_bigrams = bigramize(monograms.loc[:, left_nodes], 'L')
r_bigrams = bigramize(monograms.loc[:, right_nodes], 'R')

['L1A1B', 'L1A6', 'L1B2A', 'L1B3', 'L2A2B', 'L2A3', 'L2B5A', 'L34', 'L35A', 'L36', 'L45B', 'L46', 'L5A5B']
['R1A1B', 'R1A6', 'R1B2A', 'R1B3', 'R2A2B', 'R2A3', 'R2B5A', 'R34', 'R35A', 'R36', 'R45B', 'R46', 'R5A5B']


In [8]:
clean_names = lambda x: [re.sub('^[LR]\s*','',x) for x in x.columns]
clean_names_string = lambda x: ''.join(clean_names(x))
assert(clean_names_string(l_bigrams) == clean_names_string(r_bigrams))
assert(np.all(l_bigrams.index == r_bigrams.index))
dual_bigrams = pd.DataFrame(l_bigrams.values + r_bigrams.values, 
                       columns = clean_names(l_bigrams),
                       index = l_bigrams.index)

In [9]:
dual_bigrams.head()

Unnamed: 0_level_0,1A1B,1A6,1B2A,1B3,2A2B,2A3,2B5A,34,35A,36,45B,46,5A5B
Dummy ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
assert(clean_names_string(monograms.loc[:,left_nodes]) == clean_names_string(monograms.loc[:,right_nodes]))
dual_monograms = pd.DataFrame(monograms.loc[:,left_nodes].values + monograms.loc[:,right_nodes].values,
             columns = clean_names(monograms.loc[:,left_nodes]),
             index = monograms.index)
dual_monograms.head()

Unnamed: 0_level_0,1A,1B,2A,2B,3,4,5A,5B,6,RPLN
Dummy ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [11]:
db = pd.concat([data, dual_monograms, dual_bigrams], axis = 1)
print(db.shape)
db.head()

(593, 40)


Unnamed: 0_level_0,Age at Diagnosis (Calculated),Pathological Grade,Gender,Race,Tm Laterality (R/L),Tumor subsite (BOT/Tonsil/Soft Palate/Pharyngeal wall/GPS/NOS),Affected Lymph node UPPER,HPV/P16 status,T-category,N-category,...,1B3,2A2B,2A3,2B5A,34,35A,36,45B,46,5A5B
Dummy ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,57.019444,III,Male,White/Caucasion,R,Tonsil,"[R2A, R2B, R3]",Positive,T1,N2,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55.855556,III,Female,White/Caucasion,R,BOT,"[R2A, R2B]",Positive,T2,N2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60.222222,II,Female,White/Caucasion,L,Tonsil,"[L2A, L2B, L3]",Positive,T2,N2,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,66.344444,III,Male,White/Caucasion,R,BOT,"[R2A, R2B]",Positive,T1,N1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,49.733333,II,Male,White/Caucasion,R,BOT,[R3],Positive,T4,N2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
db['FT'] = db["Feeding tube 6m"] == 'Y'
db['AS'] = db['Aspiration rate(Y/N)'] == 'Y'

In [30]:
default_clusterer = lambda : FClusterer(4, dist_func = l2, link='ward')
def get_correlation(input_df,print_out = False):
    fc = default_clusterer()
    labels = fc.fit_predict(input_df.values)
    for tox in ['FT','AS']:
        pval = chi2_contingency(get_contingency_table(labels, db[tox]))[1]
        if print_out:
            print(tox, pval)
    return labels

In [32]:
spatial_labels = get_correlation(pd.concat([dual_bigrams, dual_monograms],axis = 1), True)

FT 0.0001348696115123915
AS 0.00019586175234622652


In [33]:
nonspatial_labels = get_correlation(dual_monograms, True)

FT 0.0063030120432415885
AS 2.6287776155447785e-05


In [35]:
def cluster_summary(cluster_labels):
    outcome_names = ['FT','AS']
    labels = np.unique(cluster_labels)
    cluster_df_data = {}
    
    def get_stats(y, outcome, label, only_percent = False):
        x = cluster_labels == label
        try:
            pval = fisher_exact_test(cluster_labels, y)
        except:
            pval = 0
        count = y.sum()
        percent = np.round(100*count/x.sum())
        if only_percent:
            return {outcome + ' percent': percent}
        vals = {outcome + ' pval': pval,
               outcome + ' count': count,
               outcome + ' percent': percent}
        return vals
    
    cols = ['FT','AS','N-category']
    for label in labels:
        args = np.argwhere(cluster_labels == label).ravel()
        subset_df = db.loc[db.index[args], cols]
        subset_oh = pd.get_dummies(subset_df)
        data_dict = {}
        data_dict['AA Total Patients'] = len(args)
        for col in subset_oh.columns:
            values = subset_oh[col].values
            if col in ['FT', 'AS']:
                vals = get_stats(values, col, label)
            else:
                vals = get_stats(values, col, label, True)
            for k,v in vals.items():
                data_dict[k] = v
        cluster_df_data[label] = data_dict
        
    return pd.DataFrame(cluster_df_data).T
sclusters = cluster_summary(spatial_labels)
sclusters

Unnamed: 0,AA Total Patients,FT pval,FT count,FT percent,AS pval,AS count,AS percent,N-category_N0 percent,N-category_N1 percent,N-category_N2 percent,N-category_N3 percent
1,96.0,0.057304,28.0,29.0,0.379787,30.0,31.0,2.0,1.0,92.0,5.0
2,141.0,0.727884,20.0,14.0,1.0,19.0,13.0,,6.0,90.0,4.0
3,80.0,0.348014,21.0,26.0,0.618057,17.0,21.0,2.0,2.0,92.0,2.0
4,276.0,0.0,33.0,12.0,0.0,35.0,13.0,2.0,22.0,74.0,2.0


In [36]:
cluster_summary(nonspatial_labels)

Unnamed: 0,AA Total Patients,FT pval,FT count,FT percent,AS pval,AS count,AS percent,N-category_N0 percent,N-category_N1 percent,N-category_N2 percent,N-category_N3 percent
1,65.0,0.914123,19.0,29.0,0.584589,25.0,38.0,3.0,2.0,91.0,5.0
2,45.0,0.108372,11.0,24.0,0.310171,7.0,16.0,2.0,,96.0,2.0
3,200.0,0.0,36.0,18.0,0.0,30.0,15.0,,7.0,88.0,6.0
4,283.0,0.0,36.0,13.0,0.0,39.0,14.0,2.0,20.0,76.0,1.0
