In [162]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from Constants import Const
import Utils
import re
import glob
import SpatialPreprocessing as spatial
import SymptomPreprocessing as symp
from Levenshtein import distance as levenshtein_distance

In [156]:
class SpellChecker():
    """Rename values in dataframe columns to the closest accepted keyword.

    Values are first looked up (lower-cased) in ``aliases``. Values that are
    still not keywords are compared with the keywords, and then with the
    alias keys, using an edit distance; they are renamed when the distance
    is below ``max_edit_distance``.
    """

    def __init__(self, keywords, aliases, max_edit_distance = .15,normalize_score = True):
        """Store the vocabulary and matching settings.

        keywords: list of accepted names.
        aliases: dict mapping lower-cased alternate spellings to accepted names.
        max_edit_distance: a match is accepted only when its distance is
            strictly below this value.
        normalize_score: if True, distances are divided by the length of the
            longer of the two compared words.
        """
        self.keywords = keywords #list
        self.keywordset = set(keywords)
        self.aliases = aliases #dict
        self.normalize_score = normalize_score
        #substrings that mark the two sides of a paired structure
        self.positional_pairs = [('Lt','Rt'),('L_','R_'),('_L','_R')]
        self.max_edit_distance = max_edit_distance

    def word_distance(self,word1,word2):
        """Return the (optionally normalized) edit distance between two words.

        The words are stripped, lower-cased and have underscores removed
        before the Levenshtein distance is computed. An extra penalty of
        ``len(word1)`` is added when one word carries a marker from
        ``positional_pairs`` and the other carries its counterpart.
        """
        dist = 0
        #add an extra penalty if one is right and one is left
        for (x,y) in self.positional_pairs:
            if (x in word1 and y in word2) or (y in word1 and x in word2):
                dist += len(word1)
                break
        clean = lambda w: w.strip().lower().replace("_","")
        dist += levenshtein_distance(clean(word1),clean(word2))
        if self.normalize_score:
            #guard the denominator so two empty strings do not divide by zero
            dist = dist/max(len(str(word1)),len(str(word2)),1)
        return dist

    def best_spell_match(self,name,words):
        """Return ``(closest_word, distance)`` for ``name`` among ``words``.

        Returns ``(None, np.inf)`` when ``words`` is empty. The search stops
        early on an exact (distance <= 0) match.
        """
        best_match = None
        best_dist = np.inf
        for word in words:
            ld = self.word_distance(name,word)
            if ld < best_dist:
                best_dist = ld
                best_match = word
                if ld <= 0:
                    break
        return best_match, best_dist

    def _apply_alias(self, value):
        """Replace a string by its alias target; leave other values untouched."""
        if isinstance(value, str):
            return self.aliases.get(value.lower(), value)
        return value

    def spellcheck_df(self, df, cols=None,unique = True):
        """Spell-check the given columns of a copy of ``df``.

        df: dataframe to check; it is not modified.
        cols: columns to check; defaults to every column.
        unique: currently unused, kept for interface compatibility.

        Returns ``(checked_df, renames)`` where ``renames`` maps each value
        that was changed by fuzzy matching to its replacement (exact alias
        hits are applied but not listed).
        """
        df = df.copy()
        if cols is None:
            cols = list(df.columns)

        #the alias keys do not change while we iterate, so build the list once
        alias_keys = list(self.aliases.keys())
        all_renames = {}
        for col in cols:
            df[col] = df[col].apply(self._apply_alias)
            in_col = {str(w) for w in np.unique(df[col].values.astype('str'))}
            col_words = sorted(w for w in in_col if w not in self.keywordset)
            if len(col_words) < 1:
                #nothing to fix here, but later columns still need checking
                continue
            #only keywords that are not already present are rename candidates
            matchwords = [i for i in self.keywords if i not in in_col]
            rename_dict = {}
            for cword in col_words:
                match,dist = self.best_spell_match(cword,matchwords)
                if dist < self.max_edit_distance:
                    if match != cword:
                        rename_dict[cword] = match
                        all_renames[cword] = match
                else:
                    #fall back to fuzzy-matching against the alias spellings
                    aliasmatch, alias_dist = self.best_spell_match(cword, alias_keys)
                    if alias_dist < self.max_edit_distance:
                        target = self.aliases[aliasmatch]
                        rename_dict[cword] = target
                        all_renames[cword] = target
            df[col] = df[col].apply(lambda x: rename_dict.get(x,x))
        return df,all_renames
    
class MdasiOrganData():
    """Loader that merges per-patient spatial files with dose (DVH) data.

    The class-level tables below hold the name mappings and column names
    used by the methods when reading files.
    """
       
    #lower-cased alternate organ spellings -> organ names used in this code
    #(passed to SpellChecker as its alias table)
    #NOTE(review): 'esophagus_u' and 'hardpalate' appear twice with the same
    #value; the duplicates are harmless but redundant
    organ_rename_dict = {
        'cricoid': 'Cricoid_cartilage',
         'cricopharyngeus': 'Cricopharyngeal_Muscle',
         'esophagus_u': 'Esophagus',
         'oral_cavity': 'Extended_Oral_Cavity',
         'musc_geniogloss': 'Genioglossus_M',
         'hardpalate': 'Hard_Palate',
         'bone_hyoid': 'Hyoid_bone',
         'musc_constrict_i': 'IPC',
         'lips_lower': 'Lower_Lip',
         'lips_upper': 'Upper_Lip',
         'musc_constrict_m': 'MPC',
         'musc_mgh_complex': 'Mylogeniohyoid_M',
         'musc_mghcomplex': 'Mylogeniohyoid_M',
         'palate_soft': 'Soft_Palate',
         'musc_constrict_s': 'SPC',
         'spinalcord_cerv': 'Spinal_Cord',
         'larynx_sg': 'Supraglottic_Larynx',
         'cartlg_thyroid': 'Thyroid_cartilage',
         'brachial_plex_r': 'Rt_Brachial_Plexus',
         'brachial_plex_l': 'Lt_Brachial_Plexus',
         'pterygoid_lat_r': 'Rt_Lateral_Pterygoid_M',
         'pterygoid_lat_l': 'Lt_Lateral_Pterygoid_M',
         'musc_masseter_r': 'Rt_Masseter_M',
         'musc_masseter_l': 'Lt_Masseter_M',
         'bone_mastoid_r': 'Rt_Mastoid',
         'bone_mastoid_l': 'Lt_Mastoid',
         'pterygoid_med_r': 'Rt_Medial_Pterygoid_M',
         'pterygoid_med_l': 'Lt_Medial_Pterygoid_M',
         'parotid_r': 'Rt_Parotid_Gland',
         'parotid_l': 'Lt_Parotid_Gland',
         'musc_sclmast_r': 'Rt_Sternocleidomastoid_M',
         'musc_sclmast_l': 'Lt_Sternocleidomastoid_M',
         'glnd_submand_r': 'Rt_Submandibular_Gland',
         'glnd_submand_l': 'Lt_Submandibular_Gland',
         'musc_digastric_ra': 'Rt_Ant_Digastric_M',
         'musc_digastric_la': 'Lt_Ant_Digastric_M',
        'musc_digastric_rp': 'Rt_Post_Digastric_M',
         'musc_digastric_lp': 'Lt_Post_Digastric_M',
        'esophagus_u': 'Esophagus',
        'hardpalate':'Hard_Palate',
    }
    
    #probably deprecated
    additional_renames = {
        'Lt_Ant_Digastric_M': 'Musc_Digastric_LA',
        'Rt_Ant_Digastric_M': 'Musc_Digastric_RA',
    }
    
    #should map file column headers to the names in the lists below
    #(applied with DataFrame.rename when a file is read)
    file_header_renames = {
        'x coordinate': 'x',
        'y coordinate': 'y',
        'z coordinate': 'z',
        'ROI': 'roi',
        'Structure Volume': 'volume',
        'Volume': 'volume',
        'Min Value': 'min_dose',
        'Max Value': 'max_dose',
        'Max': 'max_dose',
        'Min': 'min_dose',
        'Mean': 'mean_dose',
        'Mean Value': 'mean_dose',
        'Mean doses': 'mean_dose',
        'Minimum': 'min_dose',
        'Maximum': 'max_dose',
        'mean': 'mean_dose',
        'minGy': 'min_dose',
        'maxGy': 'max_dose',
    }
    
    #header names for the files
    
    #columns of the pairwise distance file (the spelling 'Eucledian' is the
    #literal header the code looks for)
    roi_cols = ['Reference ROI','Target ROI']
    roi_dist_col = 'Eucledian Distance (mm)'
    
    #columns of the per-organ (centroid/dose) file after header renaming
    centroid_roi_col = 'roi'
    volume_col = 'volume'
    mean_dose_col = 'mean_dose'
    centroid_cols = ['x','y','z']
    
    def __init__(self, 
                 root = None,
                 organ_info_json = None,
                 data_type = np.float16,
                 spellcheck_organs = True,
                 mdasi_dvh_path = None,
                ):
        """Configure paths, the organ list and the spellchecker.

        root: directory of the spatial files; defaults to
            ``Const.mdasi_centroid_dir``.
        organ_info_json: accepted but not used by this constructor.
        data_type: dtype used for the per-organ distance arrays.
        spellcheck_organs: whether organ names read from files are
            spell-checked (robust to typos, but slow).
        mdasi_dvh_path: dose spreadsheet path; defaults to a file under
            ``Const.data_dir``.
        """
        #fall back to the project defaults only when no path is supplied
        self.mdasi_dvh_path = (
            Const.data_dir + 'Cohort_SMART2_530pts_(486pts).xlsx'
            if mdasi_dvh_path is None
            else mdasi_dvh_path
        )
        self.data_type = data_type
        if root is None:
            root = Const.mdasi_centroid_dir
        self.root = root

        organs = self.get_organ_list()
        self.organ_list = organs
        self.num_organs = len(organs)

        self.spellcheck_organs = spellcheck_organs
        #keywords are the accepted organ names, aliases the known alternates
        self.spellchecker = SpellChecker(
            Const.organ_list,
            MdasiOrganData.organ_rename_dict,
        )
        
    def load_spatial_files(self):
        """Return whatever ``spatial.load_spatial_files`` yields for ``self.root``."""
        #a commented-out filter that kept only keys below 100 used to live
        #here (marked "temp")
        return spatial.load_spatial_files(root=self.root)
        
    def get_organ_list(self, skip_gtv = True):
        """Return ``Const.organ_list``; ``skip_gtv`` is accepted but not used."""
        organs = Const.organ_list
        return organs
    
    def rename_gtvs(self,gtvlist):
        #rename gtvs for a single patient
        new_dict = {}
        sorted_entries = sorted(gtvlist, key = lambda x: -x[1].get(MdasiOrganData.volume_col,0))
        currname = 'GTVp'
        node_num = 0
        for (gtvname, gtv) in sorted_entries:
            #stuff to error check nan could go here
            try:
                volume = float(gtv.get(MdasiOrganData.volume_col,np.nan))
                temp_name = currname
                if(node_num > 1):
                    temp_name = temp_name + str(node_num)
                new_dict[temp_name] = gtv
                if(currname == 'GTVp'):
                    currname = 'GTVn'
                node_num += 1
            except Exception as e:
                print('error reading gtv', gtvname, gtv)
                print(e)
        return new_dict

    def is_valid_patient(self,p_entry):
        #code for cleaning up patients that are just no good
        has_gtv = False
        for oar in p_entry.keys():
            if 'GTV' in oar:
                has_gtv = True
                return has_gtv #delete this line if I want more stuff here
            else:
                has_gtv = False
        return has_gtv
    
    def best_spell_match(self,name,words):
        #compare a word with a list of words
        #get the closest word to source word based on edit distance
        best_match = None
        best_dist = np.inf
        ldist = lambda x,y: levenshtein_distance(x.strip().lower(),y.strip.lower())
        for word in words:
            #Lt and Rt changes keep getting weird
            if 'Lt' in name and 'Rt' in word or 'Rt' in name and 'Lt' in word:
                continue
            ld = levenshtein_distance(name,word)
            if ld < best_dist:
                best_dist = ld
                best_match = word
                if ld <= 0:
                    break
        return best_match, best_dist
    
    def process_cohort_spatial_dict(self, spatial_files):
        patients = {}
        invalid_ids = []
        for pid, entry in spatial_files.items():
            try:
                p_entry = self.process_patient(entry['distances'], entry['doses'])
                if(self.is_valid_patient(p_entry)):
                    patients[pid] = p_entry
                else:
                    invalid_ids.append(pid)
            except Exception as e:
                print('error reading patient', pid)
                print(e)
        if len(invalid_ids) > 1:
            print("invalid patients", invalid_ids)
        return patients
#         return {'organs': self.organ_list, 'patients': patients}
    
    def process_patient(self, dist_file,dose_file):
        dose_dict= self.process_dose_file(dose_file)
        merged_dict = self.process_distance_file(dist_file, dose_dict)
        merged_dict= self.format_gtvs(merged_dict)
        return merged_dict
    
    def reconcile_organ_names(self, organ_dist_df, columns = None):
        #basically tries to standardize organ names accross datasets
        if type(columns) != type(None):
            organ_dist_df = organ_dist_df[columns]
        #check that this works idk
        organ_dist_df.replace(to_replace = r'_*GTV.*N', value = '_GTVn', regex = True, inplace = True)
        #check organs if we're looking at at a centroid file
        if self.spellcheck_organs:
            cols_to_check = self.roi_cols
            if self.centroid_roi_col in organ_dist_df.columns:
                cols_to_check = [self.centroid_roi_col]
            organ_dist_df,renames = self.spellchecker.spellcheck_df(organ_dist_df,
                                                            cols=cols_to_check)
            if len(renames) > 0:
                print('renames',renames)
        return organ_dist_df#.replace(self.oar_rename_dict())
    
    def reconcile_cohort_columns(self, organ_df):
        #placeholder
        organ_df = organ_df.rename(MdasiOrganData.file_header_renames,axis=1)
        return organ_df
    
    def read_spatial_file(self, file):
        df = pd.read_csv(file)
        df = self.reconcile_cohort_columns(df)
        df = self.reconcile_organ_names(df)
        return df
    
    
    def format_gtvs(self, mdict):
        gtvs = [(k,v) for k,v in mdict.items() if 'GTV' in k]
        if len(gtvs) < 1:
            return mdict
        oars = self.rename_gtvs(gtvs)
        for oname, odata in mdict.items():
            if 'GTV' in oname:
                continue
            oars[oname] = odata
        return oars
    
    def process_dose_file(self, dose_file, default_value = np.nan):
        dose_df = self.read_spatial_file(dose_file)
        dose_df = dose_df.set_index(MdasiOrganData.centroid_roi_col).sort_index()
        organs = sorted(dose_df.index.values)
        dose_dict = {}
        for organ in organs:
            #filter out extra organs idk
            if ('GTV' not in organ) and organ not in self.organ_list:
                continue
            entry = {}
            def getfield(col):
                try:
                    return dose_df.loc[organ, col]
                except:
                    return default_value
            entry['centroids'] = np.array([getfield(v) for v in MdasiOrganData.centroid_cols])
            dose_dict[organ] = entry
        return dose_dict
    
    def format_patient_distances(self, pdist_file):
        #read the file with the centroid info, and format it for the data
        #currently outputs a dict of {(organ1, organ2): distance} where organ1, organ2 are sorted alphaetically
        dist_df = self.read_spatial_file(pdist_file)
        dist_df = dist_df.reindex(MdasiOrganData.roi_cols + [MdasiOrganData.roi_dist_col],axis=1)
        dist_df = dist_df.dropna()
        subdf = dist_df.loc[:, MdasiOrganData.roi_cols]
        dist_df.loc[:,'organ1'] = subdf.apply(lambda x: sorted(x)[1], axis=1)
        dist_df.loc[:,'organ2'] = subdf.apply(lambda x: sorted(x)[0], axis=1)
        dist_df = dist_df.set_index(['organ1','organ2']).sort_index(kind='mergesort') #I just sort everthing alphabetically, may bug out otherwise idk
        dist_df = dist_df.loc[:,MdasiOrganData.roi_dist_col]
        return dist_df.reset_index()
    
    def process_distance_file(self, file, centroid_dict, default_value = np.nan):
        #reads a file, returns a df with organ1, organ2, distance (sorted)
        dist_df = self.format_patient_distances(file)
        rois = set(centroid_dict.keys())
        oars = sorted(set(self.organ_list).intersection(rois))
        gtvs = [r for r in rois if 'GTV' in r]
        
        merged_dict = {}
        organs = set(list(oars) + gtvs)
        #we want the entrys to be all valid organs or gtvs, but
        #the distance array to be in the shape of the predefined list
        for o1 in organs: 
            oentry = np.zeros((self.num_organs,)).astype(self.data_type)
            if(o1 not in organs):
                oentry = oentry.fill(default_value)
            else:
                for pos, o2 in enumerate(self.organ_list):
                    if o1 == o2:
                        continue
                    if o2 not in oars:
                        tdist = default_value
                    else:
                        match = dist_df[(dist_df.organ1 == o1) & (dist_df.organ2 == o2) | (dist_df.organ1 == o2) & (dist_df.organ2 == o1)]
                        if match.shape[0] > 0:
                            tdist = match[MdasiOrganData.roi_dist_col].values
                            assert(len(tdist) < 2)
                            tdist = tdist[0]
                        else:
                            tdist = default_value
                    oentry[pos] = tdist
            mdict_entry = centroid_dict[o1]
            mdict_entry['distances'] = oentry
            merged_dict[o1] = mdict_entry
        return merged_dict
    
    def filter_valid_patients(self,df):
        return df[~df.id.isnull()]
    
    def clean_dvh_df(self, df, organ_rename_dict = None):
        df = df.rename(MdasiOrganData.file_header_renames,axis=1)
        df = df[df.DicomType == "ORGAN"]
        
        #this maps words to words in the rename dict
        #somewhat weird because it can inverse the order but it re-fixes itself?
        #don't know how else to prevent bugs
        print('spellchecking...')
        spellchecked_df, _= self.spellchecker.spellcheck_df(df,['Structure'])
        print('renaming things')
        df['ROI'] = spellchecked_df['Structure']
        df = df.drop(["DicomType"],axis=1)
        df = df.reset_index()
        df = df[df.ROI.isin(self.organ_list)] #only keep organs we car about
        df = df.drop_duplicates(subset=['id','ROI','volume'])
        df = df[df.mean_dose != 'error'] #idk what this is from
        for col in df.columns:
            if 'dose' in col.lower() or 'volume' in col.lower():
                df[col] = df[col].astype(float)
        print('filtering pateints')
        df = self.filter_valid_patients(df.reset_index()) 
#         print('adding nan values for missing organs')
#         df = self.add_missing_organs(df) 
        print('adding histograms')
        hist_cols = [c for c in df.columns if (re.match('[DV]\d+',c) is not None)]
        df[hist_cols] = df[hist_cols].astype('float16')
        if 'index' in df.columns:
            df = df.drop(['index'],axis=1)
        return df
    
    def add_patient_organs(self,pid,patient_df):
        pdf = patient_df.copy()
        rois = np.unique(patient_df.ROI.values)
        for organ in self.organ_list:
            if organ not in rois:
                entry = pd.Series([pid,'missing',organ],index=['id','Structure','ROI'])
                pdf = pdf.append(entry,ignore_index=True)
        return pdf
    
    def add_missing_organs(self,df):
        dfs = []
        for pid,subdf in df.groupby('id'):
            subdf = self.add_patient_organs(pid,subdf).set_index("ROI")
            subdf = subdf.loc[self.organ_list]
            subdf = subdf.reset_index()
            dfs.append(subdf)
        return pd.concat(dfs)
    
    def load_mdasi_doses(self,path=None):
        if path is None:
            path = self.mdasi_dvh_path
        if 'xlsx' in path:
            dvh_df = pd.read_excel(path,index_col=0)
        else: 
            dvh_df = pd.read_csv(path,index_col=0)
        return self.clean_dvh_df(dvh_df)
    
    def get_defaults(self, df,cols=None,default_value = np.nan):
        if cols is None:
            cols = list(df.columns)
        defaults = {}
        for col in cols:
            entry = df[col].values[0]
            if Utils.iterable(entry):
                d = [np.nan for i in entry]
            else:
                d = np.nan
            if type(entry) == np.ndarray:
                d = np.array(d)
            defaults[col] = d
        return defaults
    
    def add_doses(self,pdict):
        ddf = self.load_mdasi_doses()
        pdict2 = {k:v for k,v in pdict.items()}
        to_copy = [c for c in ddf.columns if c not in ['ROI',"Structure",'id']]
        defaults = self.get_defaults(ddf,cols=to_copy)
        for pid,odata in pdict2.items():
            subddf = ddf[ddf.id.astype(int) == int(pid)]
            for oname, oentry in odata.items():
#                 if 'GTV' in oname:
#                     continue
                match = subddf[subddf.ROI.apply(lambda x: x.lower()) == oname.lower()]
                ovol = default_value
                if match.shape[0] > 0:
                    vals = match[to_copy].to_dict(orient='records')[0]
                else:
                    vals = {k:v for k,v in defaults.items()}
                for k,v in vals.items():
                    oentry[k] = v
        return pdict2
    
    def process(self,add_doses=True):
        files = self.load_spatial_files()
        pdict = self.process_cohort_spatial_dict(files)
        if add_doses:
            pdict = self.add_doses(pdict)
        return pdict
    
    
# Build the loader with its default paths and run the whole pipeline
# (spatial files, then doses, since process() defaults to add_doses=True).
md = MdasiOrganData()
pdict = md.process()
# ddf = md.load_mdasi_doses()
# ddf
# Bare expression: as the last line of the cell it displays the result.
pdict

invalid patients [559, 507]
spellchecking...
renaming things
filtering pateints
adding histograms


{1: {'GTVp': {'centroids': array([246.363 , 213.8985, 153.3393]),
   'distances': array([61.2   , 32.06  , 50.72  , 37.7   , 45.22  , 40.22  , 30.4   ,
          16.58  , 48.28  , 16.    , 20.    , 27.44  , 58.    , 29.52  ,
          25.45  , 50.78  , 35.34  , 37.38  , 72.5   , 50.7   , 21.58  ,
          44.78  , 63.9   , 55.06  , -2.    ,  2.184 ,  6.19  ,  0.9766,
          17.81  , -7.293 ,  6.    ,  2.43  , 14.96  ,  2.762 , -7.293 ,
          26.73  , 57.06  , 53.53  , 64.25  , 32.97  ], dtype=float16),
   'volume': nan,
   'mean_dose': nan,
   'min_dose': nan,
   'max_dose': nan,
   'V5': nan,
   'V10': nan,
   'V15': nan,
   'V20': nan,
   'V25': nan,
   'V30': nan,
   'V35': nan,
   'V40': nan,
   'V45': nan,
   'V50': nan,
   'V55': nan,
   'V60': nan,
   'V65': nan,
   'V70': nan,
   'V75': nan,
   'V80': nan,
   'D2': nan,
   'D5': nan,
   'D10': nan,
   'D15': nan,
   'D20': nan,
   'D25': nan,
   'D30': nan,
   'D35': nan,
   'D40': nan,
   'D45': nan,
   'D50': nan,
   

In [None]:
def pdict_to_df(pdict, olist= None, values=None):
    """Unfinished stub: only resolves ``values`` and returns None.

    When ``values`` is None it is taken from ``pdict.columns``; ``olist`` is
    not used yet.
    NOTE(review): a plain dict has no ``columns`` attribute, so the default
    path only works for dataframe-like input - confirm the intended type.
    """
    values = pdict.columns if values is None else values
    

In [177]:
# Load the symptom table through the SymptomPreprocessing module, then pass
# it through that module's impute_and_group step.
symp_df = symp.load_mdasi()
symp_df = symp.impute_and_group(symp_df)
# Bare expression: as the last line of the cell it displays the dataframe.
symp_df

before drop count 823
after drop count 535


  val_df[name] = timestep


error (%) 0.7424412531907885
[('severe_6wk_symptoms', 0.009345794392523364), ('severe_late_symptoms', 0.014953271028037384), ('moderate_6wk_symptoms', 0.052336448598130844), ('moderate_late_symptoms', 0.04672897196261682), ('mild_6wk_symptoms', 0.19626168224299065), ('mild_late_symptoms', 0.17009345794392525)]


Unnamed: 0,followup_days,id,is_male,age,t_stage,nd,os,subsite,n_stage,treatment2,...,symptomdomain_mucosal,symptomdomain_speech,symptomdomain_pain,symptomdomain_general,severe_6wk_symptoms,severe_late_symptoms,moderate_6wk_symptoms,moderate_late_symptoms,mild_6wk_symptoms,mild_late_symptoms
0,203.0,1,True,72.0,t1,0.0,0,BOT,n2b,Cr,...,"[0.0, 0.0, 0.5, 0.0, 1.0, 1.0, 2.5, 6.0, 1.5, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 5.0, 0.0, ...","[1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 5.0, 7.0, 3.0, ...","[0.5, 0.3, 0.3, 0.1, 0.6, 0.2, 1.8, 2.3, 1.9, ...",False,False,False,False,False,False
1,1201.0,7,True,48.0,t0,0.0,1,NOS,n2b,Cr,...,"[0.0, 0.5, 0.5, 2.0, 3.5, 0.5, 2.5, 3.0, 1.0, ...","[0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 4.0, 2.0, 1.0, ...","[0.0, 0.0, 1.0, 4.0, 3.0, 3.0, 8.0, 5.0, 5.0, ...","[0.1, 1.3, 1.5, 2.5, 2.6, 1.5, 2.5, 1.7, 1.3, ...",False,False,False,False,False,False
2,1619.0,8,True,61.0,t2,0.0,1,BOT,n2c,Cr,...,"[0.5, 0.5, 1.0, 1.5, 2.5, 1.5, 3.0, 4.5, 2.0, ...","[0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, ...","[4.0, 7.0, 3.0, 1.0, 2.0, 1.0, 2.0, 6.0, 0.0, ...","[1.1, 1.9, 1.2, 1.5, 1.8, 1.1, 1.8, 2.3, 2.8, ...",False,False,False,False,False,False
3,1884.0,9,True,50.0,t1,1.0,1,Tonsil,n2a,SCr,...,"[0.0, 0.5, 1.0, 2.0, 2.5, 2.5, 3.5, 5.0, 1.5, ...","[0.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, ...","[0.0, 1.0, 1.0, 1.0, 2.0, 4.0, 4.0, 7.0, 2.0, ...","[0.0, 0.2, 0.3, 0.6, 1.1, 1.1, 1.5, 1.7, 0.5, ...",False,False,False,False,False,False
4,476.0,10,True,70.0,t4,0.0,0,BOT,n2b,ICr,...,"[0.0, 1.0, 0.5, 1.5, 4.5, 3.0, 5.0, 4.5, 1.5, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 6.0, 7.0, 0.0, ...","[0.8, 0.7, 0.7, 1.5, 2.0, 2.0, 3.9, 3.6, 0.5, ...",False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,147.0,811,True,54.0,t2,,1,Tonsil,n1,,...,"[0.0, 0.0, 0.5, 1.5, 2.0, 2.5, 2.5, 6.0, 2.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1.0, 0.0, 1.0, 2.0, 1.0, 3.0, 3.0, 4.0, 0.0, ...","[0.6, 0.5, 0.3, 0.5, 0.6, 0.6, 1.2, 2.1, 0.2, ...",False,False,False,False,False,False
531,39.0,813,True,68.0,,,1,NOS,,,...,"[0.0, 0.0, 1.5, 1.0, 3.0, 3.5, 4.0, 4.5, 0.5, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 4.0, 4.0, 3.0, 4.0, 0.0, ...","[0.2, 0.0, 0.1, 0.5, 0.1, 0.7, 1.1, 1.2, 0.1, ...",False,False,False,False,False,False
532,,816,True,78.0,tx,,,BOT,n1,,...,"[1.5, 0.0, 1.0, 1.5, 1.0, 1.5, 2.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...","[1.0, 0.0, 0.0, 2.0, 3.0, 4.0, 3.0, 2.0, 1.0, ...","[0.7, 0.7, 0.1, 0.8, 0.9, 1.5, 0.4, 1.3, 0.0, ...",False,False,False,False,False,False
533,138.0,819,False,52.0,t1,,1,BOT,n1,,...,"[0.0, 0.5, 0.5, 1.0, 3.0, 6.0, 3.5, 3.5, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, ...","[0.0, 1.0, 1.0, 2.0, 4.0, 6.0, 4.0, 4.0, 0.0, ...","[0.0, 0.3, 0.3, 0.4, 1.0, 2.5, 1.2, 1.5, 0.1, ...",False,False,False,False,False,False


{'followup_days': nan,
 'id': nan,
 'is_male': nan,
 'age': nan,
 't_stage': [nan, nan],
 'nd': nan,
 'os': nan,
 'subsite': [nan, nan, nan],
 'n_stage': [nan, nan, nan],
 'treatment2': [nan, nan],
 'rt': nan,
 'performance_score': nan,
 'concurrent': nan,
 'duration': nan,
 'is_ajcc_8th_edition': nan,
 'hpv': nan,
 'rt_type': nan,
 'ic': nan,
 'chemotherapy': [nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 'typetreatment': [nan, nan],
 'dates': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
 'symptoms_pain': [nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 'symptoms_fatigue': [nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 'symptoms_nausea': [nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan,
  nan],
 'symptoms_sleep': [nan,
  nan,
  nan