In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
import datetime
import glob
import os
import pickle

import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import simplejson

import ProcessMdasi as pm
from Constants import Const

In [5]:
def load_distances(full=False):
    """Read one of the distance JSON files and return its parsed content.

    Parameters
    ----------
    full : bool, default False
        When True the file at ``Const.full_dist_json`` is read, otherwise
        the file at ``Const.small_dist_json``.

    Returns
    -------
    Whatever ``simplejson.load`` produces for that file.
    """
    path = Const.full_dist_json if full else Const.small_dist_json
    with open(path, 'r') as handle:
        return simplejson.load(handle)
# Sanity check: number of entries under the 'distances' key of the small distance file.
len(load_distances()['distances'])

297

In [7]:
def get_id_map(file = '../data/key_map.xlsx'):
    """Build the lookup from the spreadsheet's ``ID`` column to the MDASI id.

    Parameters
    ----------
    file : str
        Path of the Excel key map. It must contain the columns
        ``'Unnamed: 0'`` (dropped), ``'STIEFEL'`` and ``'ID'``.

    Returns
    -------
    dict
        ``{ID: mdasi_id}`` where ``mdasi_id`` is the integer left after
        removing the ``"STIEFEL_"`` text from the ``STIEFEL`` value.
    """
    # The path argument was previously ignored in favour of a hard-coded literal.
    key_map = pd.read_excel(file).drop('Unnamed: 0', axis=1)
    key_map['mdasi_id'] = key_map['STIEFEL'].apply(lambda x: int(x.replace("STIEFEL_", '')))
    return key_map.set_index('ID')['mdasi_id'].to_dict()

# Sanity check: number of ID -> mdasi_id pairs found in the default key map.
len(get_id_map())

305

In [9]:
def group_symptoms(mdasi,week_groups = [[0,1],[2,3,4,5,6,7],[13],[33]],names=['baseline','acute','6wk','6M'],drop_original=True):
    """Collapse per-week symptom ratings into one value per named week group.

    For every column whose name contains ``'symptoms_'`` (each cell is
    indexed by position) and every ``(name, weeks)`` pair, a new column
    ``<symptom>_<name>`` is added holding the maximum rating over the
    positions of those weeks, with NaN ratings counted as -1.

    Parameters
    ----------
    mdasi : pandas.DataFrame
        Must have a ``dates`` column; the first row's value is used as the
        week list for every row (``.index`` is called on it).
    week_groups : list of list
        Week values to merge; each must be present in the first row's dates.
    names : list of str
        Label for each entry of ``week_groups`` (paired with ``zip``).
    drop_original : bool
        When True the ``symptoms_*`` columns and ``dates`` are removed.

    Returns
    -------
    (DataFrame, list, list)
        A modified copy of ``mdasi``, the symptom names, and ``names``.
    """
    prefix = 'symptoms_'
    week_list = mdasi.dates.iloc[0]
    # Resolve every week to its position up front so an unknown week fails early.
    positions = []
    for weeks in week_groups:
        positions.append([week_list.index(week) for week in weeks])
    symptoms = [col.replace(prefix, '') for col in mdasi.columns if prefix in col]

    grouped = mdasi.copy()
    for label, slots in zip(names, positions):

        def worst_rating(ratings, slots=slots):
            # NaN is mapped to -1 so that any real rating wins the max.
            return np.max([np.nan_to_num(ratings[k], nan=-1) for k in slots])

        for symptom in symptoms:
            grouped[symptom + '_' + label] = grouped[prefix + symptom].apply(worst_rating)
        # Re-copy after each batch of inserted columns, as the original did.
        grouped = grouped.copy()

    if drop_original:
        obsolete = [prefix + s for s in symptoms] + ['dates']
        grouped = grouped.drop(obsolete, axis=1)
    return grouped, symptoms, names

def load_mdasi(file = '../data/MDASI_0909201_surgery_updated.csv'):
    """Read the MDASI csv, format it, group the symptoms and index by ``id``.

    Parameters
    ----------
    file : str
        Path of the csv handed to ``pd.read_csv``.

    Returns
    -------
    (DataFrame, list, list)
        The grouped frame indexed by its ``id`` column, plus the symptom
        names and timepoint names reported by ``group_symptoms``.
    """
    raw = pd.read_csv(file)
    formatted = pm.format_mdasi_columns(raw)
    grouped, symptoms, names = group_symptoms(formatted)
    indexed = grouped.set_index('id')
    return indexed, symptoms, names

# Load the grouped MDASI table; `symptoms` and `names` are passed to save_mdasi_df further down.
mdasi, symptoms, names = load_mdasi()
mdasi

Unnamed: 0_level_0,followup_days,is_male,rt,ic,subsite,t_stage,hpv,performance_score,concurrent,rt_type,...,constipation_6M,taste_6M,mucositis_6M,teeth_6M,activity_6M,mood_6M,work_6M,relations_6M,walking_6M,enjoy_6M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,203.0,True,1.0,0.0,BOT,t1,1.0,0,1.0,3.0,...,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1084.0,True,1.0,0.0,Tonsil,t4,-1.0,0,1.0,5.0,...,2.0,5.0,7.0,3.0,3.0,4.0,2.0,2.0,0.0,4.0
3,857.0,True,1.0,0.0,Soft_palate,t3,-1.0,1,1.0,2.0,...,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1333.0,False,1.0,0.0,Tonsil,t2,1.0,1,1.0,3.0,...,0.0,10.0,0.0,3.0,0.0,7.0,0.0,0.0,0.0,7.0
5,331.0,True,1.0,,BOT,t2,0.0,0,1.0,5.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,138.0,False,1.0,0.0,BOT,t1,-1.0,0,,3.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
820,157.0,True,1.0,0.0,BOT,t4,-1.0,0,1.0,5.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
821,,False,0.0,,Tonsil,t1,-1.0,1,0.0,,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
822,,True,1.0,1.0,Tonsil,t3,-1.0,,1.0,3.0,...,8.0,0.0,0.0,6.0,0.0,0.0,10.0,0.0,10.0,10.0


In [12]:
def get_finished_pids(root=None):
    """List the integer patient ids that have a ``pclouds_<pid>.json`` file.

    Parameters
    ----------
    root : str or None
        Directory to search; ``None`` means ``'../data/'``. A trailing path
        separator is optional.

    Returns
    -------
    list of int
        One id per matching file, in the order ``glob`` yields them. Files
        whose id part is not purely decimal digits are reported with a
        ``'bad pid'`` print and skipped.
    """
    if root is None:
        root = '../data/'
    prefix, suffix = 'pclouds_', '.json'
    pids = []
    # os.path.join keeps the pattern valid whether or not root ends in a separator.
    for path in glob.glob(os.path.join(root, prefix + '*' + suffix)):
        # Work on the bare file name so the separator glob returns is irrelevant.
        name = os.path.basename(path)
        pid = name[len(prefix):-len(suffix)]
        # isdecimal (unlike isnumeric) only accepts characters int() can parse.
        if pid.isdecimal():
            pids.append(int(pid))
        else:
            print('bad pid',pid)
    return pids

def get_dicoms_mdasi_stuff(mdasi=None):
    """Restrict the MDASI table to patients with finished point clouds.

    Parameters
    ----------
    mdasi : pandas.DataFrame or None
        Table indexed by MDASI id; when ``None`` it is loaded with
        ``load_mdasi()``. The passed frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per id from ``get_finished_pids()``, indexed by
        ``dicom_id``, with the previous index kept as a regular column.

    Raises
    ------
    KeyError
        If a finished id has no entry in the id map, or its MDASI id is
        not in the table's index.
    """
    ids = get_finished_pids()
    id_map = get_id_map()
    # Fail with a readable message instead of looking up None further down.
    unmapped = [i for i in ids if i not in id_map]
    if unmapped:
        raise KeyError('no mdasi id in the key map for dicom ids: ' + str(unmapped))
    indexes = [id_map[i] for i in ids]
    if mdasi is None:
        mdasi, _, _ = load_mdasi()
    # Copy the selection so adding a column does not write into a slice of the caller's frame.
    mdasi = mdasi.loc[indexes].copy()
    mdasi['dicom_id'] = ids
    mdasi = mdasi.reset_index().set_index('dicom_id')
    # The ids exceed 2**31, so request 64-bit explicitly rather than the platform int.
    mdasi.index = mdasi.index.astype(np.int64)
    return mdasi

# NOTE: this rebinds `mdasi` to the dicom_id-indexed frame, so the cell is not idempotent:
# re-running it without re-running the load cell looks up MDASI ids in a frame indexed by dicom_id.
mdasi = get_dicoms_mdasi_stuff(mdasi)
mdasi

Unnamed: 0_level_0,id,followup_days,is_male,rt,ic,subsite,t_stage,hpv,performance_score,concurrent,...,constipation_6M,taste_6M,mucositis_6M,teeth_6M,activity_6M,mood_6M,work_6M,relations_6M,walking_6M,enjoy_6M
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1054079696,525,319.0,True,1.0,0.0,Tonsil,t1,1.0,0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1072572079,611,889.0,True,1.0,0.0,Tonsil,t2,1.0,1,0.0,...,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1079757401,372,644.0,True,1.0,0.0,BOT,t1,1.0,1,0.0,...,3.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1087308891,166,1198.0,True,1.0,0.0,Tonsil,t1,1.0,0,0.0,...,1.0,2.0,0.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0
1099927508,45,1432.0,True,1.0,0.0,Tonsil,t2,1.0,0,1.0,...,1.0,8.0,5.0,2.0,2.0,2.0,4.0,4.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9399080429,140,,True,,,Tonsil,t1,1.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
9479197119,661,,False,0.0,0.0,BOT,t1,1.0,1,0.0,...,1.0,2.0,4.0,0.0,3.0,5.0,3.0,1.0,0.0,6.0
9626079921,117,1062.0,True,1.0,0.0,BOT,t2,1.0,0,1.0,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9643148771,183,744.0,False,1.0,,Tonsil,t1,1.0,0,1.0,...,0.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0


In [19]:
def np_converter(obj):
    """Fallback encoder for ``simplejson.dump(..., default=np_converter)``.

    Converts values the JSON encoder cannot handle into plain Python:

    * numpy integers -> ``int``
    * numpy or Python floats -> ``float`` rounded to 3 decimals
    * numpy arrays -> nested lists
    * numpy booleans -> ``bool``
    * ``datetime.datetime`` / ``datetime.time`` -> their string form
    * any other value for which ``np.isnan`` is true -> ``0``

    Anything else is reported with a print and returned unchanged.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    # np.floating replaces the removed np.float alias and also covers float32/float16.
    if isinstance(obj, (np.floating, float)):
        return round(float(obj), 3)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, (datetime.datetime, datetime.time)):
        return str(obj)
    try:
        if np.isnan(obj):
            return 0
    except (TypeError, ValueError):
        # np.isnan rejects non-numeric objects; let those reach the report below.
        pass
    print('np_converter cant encode obj of type', obj,type(obj))
    return obj

def save_mdasi_df(md,symptoms,timepoints):
    """Write the MDASI table and its metadata as JSON to ``Const.processed_mdasi``.

    Parameters
    ----------
    md : pandas.DataFrame
        Table to export; missing values are replaced by the string ``'Nan'``
        and rows are keyed by index value.
    symptoms : list
        Stored under the ``'symptoms'`` key.
    timepoints : list
        Stored under the ``'timepoints'`` key.
    """
    records = md.fillna('Nan').to_dict(orient='index')
    payload = dict(data=records, symptoms=symptoms, timepoints=timepoints)
    with open(Const.processed_mdasi,'w') as out:
        # np_converter handles the numpy scalars/arrays the encoder rejects.
        simplejson.dump(payload, out, default=np_converter)
# Export the current `mdasi` frame together with the symptom and timepoint names.
save_mdasi_df(mdasi,symptoms,names)

In [139]:

def read_rt_struct(rtstruct,contour_dict = None,rename=True):
    """Collect the contour points of every ROI in an RT structure set.

    Parameters
    ----------
    rtstruct
        Object queried through ``fu.get_element`` for ``'ROIContourSequence'``
        and ``'StructureSetROISequence'`` (presumably a DICOM RTSTRUCT
        dataset - TODO confirm).
    contour_dict : dict or None
        Existing ``{roi: [pointcloud, ...]}`` mapping to extend in place,
        e.g. when several structure files belong together. A new dict is
        created when None.
    rename : bool
        When True ROI names are passed through ``fu.fix_roi_name``.

    Returns
    -------
    dict or False
        ``{roi_name: [array of shape (n_points, 3), ...]}``; a name holds
        several point clouds when more than one ROI maps to it. ``False``
        is returned when either sequence is missing or empty.

    Notes
    -----
    ROIs are paired with their contours purely by position in the two
    sequences. NOTE(review): confirm that this ordering is guaranteed.
    An ROI that raises during processing is printed and skipped.
    """
    if contour_dict is None:
        contour_dict = {}

    contour_seqs = fu.get_element(rtstruct,'ROIContourSequence',[])
    roi_seqs = fu.get_element(rtstruct,'StructureSetROISequence',[])
    if not (len(contour_seqs) >= 1 and len(roi_seqs) >= 1):
        return False
    assert len(contour_seqs) == len(roi_seqs)

    for contour_item, roi_item in zip(contour_seqs, roi_seqs):
        try:
            roi_name = roi_item.ROIName
            if rename:
                roi_name = fu.fix_roi_name(roi_name)
            # Value unused, but the access is kept: an ROI without a number is skipped via the except.
            roi_item.ROINumber
            if 'ContourSequence' not in contour_item:
                continue
            slices = contour_item.ContourSequence
            # Each contour sits at a different z-height; merge them into one (n, 3) cloud.
            per_slice = [
                np.array(piece.ContourData).reshape(-1,3)
                for piece in slices
                if len(piece.ContourData) > 0
            ]
            merged = np.vstack(per_slice)
            contour_dict.setdefault(roi_name, []).append(merged)
        except Exception as e:
            print('error in read_rt_struct',e)
    return contour_dict



def investigate_names(pids,only_gtv=False):
    """List raw and fixed ROI names per patient to help spot naming issues.

    Parameters
    ----------
    pids : iterable
        Patient ids handed to ``fu.load_patient_folder``.
    only_gtv : bool
        When True keep only ROIs whose fixed name is one of
        ``'gtv', 'gtvn', 'ctv', 'ptv', 'rtv'``.

    Returns
    -------
    dict
        ``{pid: [(raw_name, fixed_name, [points per point cloud, ...]), ...]}``.
        Structure sets that ``read_rt_struct`` could not read contribute
        no entries.
    """
    wanted = ('gtv','gtvn','ctv','ptv','rtv')
    namedict = {}
    for pid in pids:
        pfiles,_ = fu.load_patient_folder(Const.unprocessed_dicoms,pid,file_types = ['RT Structure Set Storage'],as_dict=False)
        rt_structs = pfiles['RT Structure Set Storage']
        nlist = []
        for rts in rt_structs:
            struct = read_rt_struct(rts,rename=False)
            # read_rt_struct returns False for a missing/empty sequence; .items() would fail on it.
            if not struct:
                continue
            for raw_name, clouds in struct.items():
                fixed_name = fu.fix_roi_name(raw_name)
                if (not only_gtv) or fixed_name in wanted:
                    nlist.append((raw_name, fixed_name, [len(cloud) for cloud in clouds]))
        namedict[pid] = nlist
    return namedict

# these are the people with rtvs mislabeled as gtvs
# pids = [
#     2767317435,6060411302,2414841499,
#     2019523932,2279280705,4017119917,
#     5038138708,1337145443,1646439698,
#     3099145083
# ]
#these are people with no gtv
# pids =[2889102751,2894996073,2908060983,2939740989,2983776095,3035721150,3205005928]
#other
# Inspect raw vs. fixed ROI names for a single patient; enable only_gtv=True below
# to restrict the listing to gtv/gtvn/ctv/ptv/rtv.
pids = [5038138708]
investigate_names(pids,
#                   only_gtv=True
                 )

{5038138708: [('Brainstem', 'brainstem', [1275]),
  ('VB_C1', 'vb_c1', [1080]),
  ('VB_C2', 'vb_c2', [1420]),
  ('VB_C3', 'vb_c3', [1045]),
  ('VB_C4', 'vb_c4', [1004]),
  ('VB_C5', 'vb_c5', [1121]),
  ('VB_C6', 'vb_c6', [1115]),
  ('VB_C7', 'vb_c7', [1260]),
  ('Cricoid', 'cricoid', [638]),
  ('Cricopharyngeus', 'cricopharyngeal_muscle', [262]),
  ('Esophagus_U', 'esophagus', [1115]),
  ('Oral_Cavity', 'oral_cavity', [3454]),
  ('Musc_Geniogloss', 'genioglossus_m', [1051]),
  ('Glottic_Area', 'glottis', [77]),
  ('Hardpalate', 'hard_palate', [204]),
  ('Bone_Hyoid', 'hyoid', [288]),
  ('Musc_Constrict_I', 'ipc', [300]),
  ('Larynx', 'larynx', [1390]),
  ('Lips_Lower', 'lips_lower', [665]),
  ('Musc_Digastric_LA', 'ant_digastric_l', [448]),
  ('Musc_Scalene_LA', 'ant_scalene_l', [588]),
  ('Lens_L', 'eye_l', [24]),
  ('Brachial_Plex_L', 'brachial_plex_l', [1019]),
  ('Musc_Buccinat_L', 'buccinator_l', [698]),
  ('Clavicle_L', 'clavicle_l', [1986]),
  ('A_Carotid_Int_L', 'a_carotid_int_

In [131]:
# Collect the ids of patients with at least one ROI whose raw (un-renamed) name contains 'rtv'.
pids = fu.get_finished_pids()
has_rtv = set()
for pid in pids:
    pfiles,_ = fu.load_patient_folder(Const.unprocessed_dicoms,pid,file_types = ['RT Structure Set Storage'],as_dict=False)
    rt_structs = pfiles['RT Structure Set Storage']
    for rts in rt_structs:
        struct = read_rt_struct(rts,rename=False)
        # read_rt_struct returns False for a missing/empty sequence; skip those files.
        if not struct:
            continue
        # NOTE(review): the match is case-sensitive on raw names, so 'RTV...' would not count - confirm intent.
        # Uses roi_name rather than names/name so the notebook-level `names` (timepoint labels) is not overwritten.
        if any('rtv' in roi_name for roi_name in struct):
            has_rtv.add(pid)
            # One hit is enough for this patient.
            break
has_rtv

set()