In [1]:
import pandas as pd
import numpy as np
from Constants import *
import re
from glob import glob
from re import findall, match, sub
import json

In [2]:
class Const():
    #for troubleshooting, will migrate to own file
    data_dir = "../data/" #private data
    resource_dir = "../resources/" #public data (can be on github)
    
    mdasi_file = data_dir + "MDASI_72021.xlsx"
    camprt_dir = data_dir + "CAMPRT_Centroids/"
    
    organ_info_json = resource_dir + "OrganInfo.json"
    symptom_info_json = resource_dir + "symptoms.json"

In [3]:
def load_spatial_files(root = None):
    #reads in the files for tumor centroids and ROI-tumor distances
    #returns {'distances': [file_names],'doses': [file_names]}
    root = Const.camprt_dir if root is None else root
    file_sort = lambda x: sorted(x, key =
                                         lambda file:
                                             max([int(x) for x in findall("[0-9]+", file)])
                                    )
    try:
        distance_files = file_sort(glob(Const.camprt_dir + '**/*distances.csv'))
    except:
        distance_files = []
    try:
        dose_files = file_sort(glob(Const.camprt_dir + '**/*centroid*.csv'))
    except: 
        dose_files = []
    return {'distances': distance_files, 'doses': dose_files}

spatial_files = load_spatial_files()
spatial_files

{'distances': ['../data/CAMPRT_Centroids/3/3_distances.csv',
  '../data/CAMPRT_Centroids/4/4_distances.csv',
  '../data/CAMPRT_Centroids/10/10_distances.csv',
  '../data/CAMPRT_Centroids/11/11_distances.csv',
  '../data/CAMPRT_Centroids/17/17_distances.csv',
  '../data/CAMPRT_Centroids/27/27_distances.csv',
  '../data/CAMPRT_Centroids/29/29_distances.csv',
  '../data/CAMPRT_Centroids/31/31_distances.csv',
  '../data/CAMPRT_Centroids/33/33_distances.csv',
  '../data/CAMPRT_Centroids/34/34_distances.csv',
  '../data/CAMPRT_Centroids/35/35_distances.csv',
  '../data/CAMPRT_Centroids/36/36_distances.csv',
  '../data/CAMPRT_Centroids/37/37_distances.csv',
  '../data/CAMPRT_Centroids/39/39_distances.csv',
  '../data/CAMPRT_Centroids/41/41_distances.csv',
  '../data/CAMPRT_Centroids/46/46_distances.csv',
  '../data/CAMPRT_Centroids/49/49_distances.csv',
  '../data/CAMPRT_Centroids/100/100_distances.csv',
  '../data/CAMPRT_Centroids/101/101_distances.csv',
  '../data/CAMPRT_Centroids/102/102_d

In [None]:
class OrganData():
    
    additional_renames = {
        'Lt_Ant_Digastric_M': 'Musc_Digastric_LA',
        'Rt_Ant_Digastric_M': 'Musc_Digastric_RA',
    }
    
    #should map columns to the ones in the lists below
    file_header_renames = {
        'x coordinate': 'x',
        'y coordinate': 'y',
        'z coordinate': 'z',
        'ROI': 'roi',
        'Structure Volume': 'volume',
        'Min Value': 'min_dose',
        'Max Value': 'max_dose',
        'Mean Value': 'mean_dose'
    }
    
    roi_cols = ['Reference ROI','Target ROI']
    roi_dist_col = 'Eucledian Distance (mm)'
    
    centroid_roi_col = 'roi'
    volume_col = 'volume'
    mean_dose_col = 'mean_dose'
    centroid_cols = ['x','y','z']

    def __init__(self, organ_info_json = None):
        organ_info_json = Const.organ_info_json if organ_info_json is None else organ_info_json
        with open(Const.organ_info_json) as f:
            self.organ_dict = json.load(f)
        self.organ_list = self.get_organ_list()
        self.num_organs = len(self.organ_list)
    
    def get_organ_list(self, skip_gtv = False):
        olist = []
        for organ,odata in self.organ_dict.items():
            if skip_gtv and odata['tissue_type'] == 'gtv':
                continue
            pmods = odata['positional_modifiers']
            for pm in pmods:
                olist.append(organ+pm)
            if len(pmods) < 1:
                olist.append(organ)
        return sorted(olist)
            
    def oar_rename_dict(self):
        #gives a dict for renaming organs in the old CAMPRt cohort with the new organ names
        rename_dict = OrganData.additional_renames
        for organ,odata in self.organ_dict.items():
            for alt_name in odata['alt_names']:
                pm = odata['positional_modifiers']
                if len(pm) < 1:
                    rename_dict[alt_name] = organ
                else:
                    for modifier in pm:
                        alt_key = self.standardize_position_modifier(alt_name, modifier)
                        rename_dict[alt_key] = organ + modifier
        return rename_dict
    
    def standardize_position_modifier(self, old_basename, new_modifier):
        #just reconciling the way the positions are added between cohorts
        new_name = old_basename
        lmod = new_modifier.lower()
        if('_r' in lmod):
            new_name = 'Rt_' + new_name
        elif('_l' in lmod):
            new_name = 'Lt_' + new_name
        elif('rt_' in lmod):
            new_name = new_name + '_R'
        elif('lt_' in lmod):
            new_name = new_name + '_L'
        return new_name
    
    def reconcile_organ_names(self, organ_dist_df, columns = None):
        #basically tries to standardize organ names accross datasets
        if type(columns) != type(None):
            organ_dist_df = organ_dist_df[columns]
        organ_dist_df.replace(to_replace = r'_*GTV.*N', value = '_GTVn', regex = True, inplace = True)
        return organ_dist_df.replace(self.oar_rename_dict())
    
    def reconcile_cohort_columns(self, organ_df):
        #placeholder
        return organ_df.rename(OrganData.file_header_renames,axis=1)
    
    
    def read_spatial_file(self, file):
        df = pd.read_csv(file)
        df = self.reconcile_organ_names(df)
        df = self.reconcile_cohort_columns(df)
        return df

    def format_patient_distances(self, pdist_file):
        #read the file with the centroid info, and format it for the data
        #currently outputs a dict of {(organ1, organ2): distance} where organ1, organ2 are sorted alphaetically
        dist_df = self.read_spatial_file(pdist_file)
    
        dist_df.loc[:,'organ1'] = dist_df.loc[:,OrganData.roi_cols].apply(lambda x: sorted(x)[1], axis=1)
        dist_df.loc[:,'organ2'] = dist_df.loc[:,OrganData.roi_cols].apply(lambda x: sorted(x)[0], axis=1)
        dist_df = dist_df.set_index(['organ1','organ2']).sort_index(kind='mergesort') #I just sort everthing alphabetically, may bug out otherwise idk
        dist_df = dist_df.loc[:,OrganData.roi_dist_col]
        return dist_df.reset_index()

#     def gen_tumor_dist_matrix(self, dist_dict, default_val = 0):
#         dist_matrix = np.empty((self.num_organs, self.num_organs))
#         for row in range(0, od.num_organs):
#             for col in range(row + 1, od.num_organs):
#                 o1 = od.organ_list[row]
#                 o2 = od.organ_list[col]
#                 try:
#                     dist_matrix[row,col] = dist_dict.get((o1,o2),default_val)
#                 except:
#                     print('error reading', (o1,o2))
#         dist_matrix += dist_matrix.transpose()
#         return dist_matrix
    
    def process_distance_file(self,file, default_value = np.nan):
        #reads a file, returns a dict of dstances 
        dist_df = self.format_patient_distances(file)
        dist_dict = {}
        for o1 in self.organ_list:
            oentry = {}
            for o2 in self.organ_list:
                if(o1 == o2):
                    continue
                match = dist_df[(dist_df.organ1 == o1) & (dist_df.organ2 == o2) | (dist_df.organ1 == o2) & (dist_df.organ2 == o1)]
                if match.shape[0] > 0:
                    tdist = match[OrganData.roi_dist_col].values
                    assert(len(tdist < 2))
                    tdist = tdist[0]
                else:
                    tdist = default_value
                oentry[o2] = tdist
            dist_dict[o1] = oentry
        dist_dict
        return dist_dict
    
    def process_dose_file(self, dose_file):
        dose_df = self.read_spatial_file(dose_file)
        #I'm pretty sure this preserves order (will be xyz), but not 100% if there are bugs laters
        dose_df.loc[:,'centroids'] = dose_df[OrganData.centroid_cols].apply(lambda x: tuple(x), axis=1)
        dose_df = dose_df.drop(OrganData.centroid_cols,axis=1)
        
        dose_df = dose_df.set_index(OrganData.centroid_roi_col).loc[self.organ_list].sort_index()
        df = dose_df.loc[:,['centroids',OrganData.volume_col, OrganData.mean_dose_col]]
        return df.to_dict(orient='index')
    
    def process_patient(self, dist_file,dose_file):
        dist_dict = self.process_distance_file(dist_file)
        dose_dict = self.process_dose_file(dose_file)
        merged_dict = {}
        for organ in self.organ_list:
            dists = dist_df[organ]
            doses = dose_df[organ]
            if(np.isnan(doses['volume'])):
                continue
            entry = {k:v for k,v in doses.items()}
            entry['distances'] = dists
            merged_dict[organ] = entry
        return merged_dict


od = OrganData()
patients = []
for i in range(len(spatial_files['distances'])):
    dist_file = spatial_files['distances'][i]
    dose_file = spatial_files['doses'][i]
    p_entry = od.process_patient(dist_file, dose_file)
    patients.append(p_entry)
patients

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [None]:
dist_df

In [None]:
sorted(['_g','bone','1'])