In [1]:
import os
import pandas as pd
import numpy as np

<h2> Tratamiento de los datos </h2>

<h3>Carga de los datos y de las rutas de las imagenes</h3>

En primer lugar elaboraremos un dataframe elaborado con Pandas en el que incluiremos las rutas de las imagenes, su identificación, la clase de cancer, su edad, su tipo de operación si es que ha realizado alguna y su supervivencia.
Tambien serán eliminados los campos que no sean importantes como las identificaciones de otros años.


In [2]:
directory_path = 'G:\Dataset'
os.chdir(directory_path)

survival_path = os.path.join(directory_path, 'survival_info.csv')
survival_dataframe = pd.read_csv(survival_path, header=0, index_col='Brats20ID')
survival_dataframe.index.names = ['ID']

grade_dataframe = os.path.join(directory_path, 'name_mapping.csv')
grade_dataframe = pd.read_csv(grade_dataframe, header=0, index_col='BraTS_2020_subject_ID')
grade_dataframe.index.names = ['ID']

In [3]:
dataframe = grade_dataframe.join(survival_dataframe)
dataframe.drop(columns=['BraTS_2017_subject_ID', 'BraTS_2018_subject_ID', 'TCGA_TCIA_subject_ID', 'BraTS_2019_subject_ID'], inplace=True)
dataframe.head()
dataframe.describe(include = 'all')

Unnamed: 0,Grade,Age,Survival_days,Extent_of_Resection
count,369,236.0,236.0,129
unique,2,,218.0,2
top,HGG,,82.0,GTR
freq,293,,3.0,119
mean,,61.223203,,
std,,11.874114,,
min,,18.975,,
25%,,54.24425,,
50%,,61.471,,
75%,,69.2,,


In [4]:
#dataframe = dataframe.reindex(columns = ['Grade', 'Age', 'Survival_days', 'Extent_of_Resection', 't1', 't1ce', 't2', 'flair'])
dataframe.rename(columns={'Grade':'grade', 'Age':'age','Survival_days':'survival_days','Extent_of_Resection':'extent_of_resection'}, inplace=True)

In [5]:
def load_image_path(dataframe, directory_path):
    '''
    '''
    images=['_t1.nii', '_t1ce.nii', '_t2.nii', '_flair.nii']
    for ID in dataframe.index:
        dataframe.at[ID, 't1'] = os.path.join(directory_path, ID, ID+images[0])
        dataframe.at[ID, 't1ce'] = os.path.join(directory_path, ID, ID+images[1])
        dataframe.at[ID, 't2'] = os.path.join(directory_path, ID, ID+images[2])
        dataframe.at[ID, 'flair'] = os.path.join(directory_path, ID, ID+images[3])
    return dataframe

In [6]:
dataframe = load_image_path(dataframe, directory_path)
dataframe.head()

Unnamed: 0_level_0,grade,age,survival_days,extent_of_resection,t1,t1ce,t2,flair
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BraTS20_Training_001,HGG,60.463,289,GTR,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...
BraTS20_Training_002,HGG,52.263,616,GTR,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...
BraTS20_Training_003,HGG,54.301,464,GTR,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...
BraTS20_Training_004,HGG,39.068,788,GTR,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...
BraTS20_Training_005,HGG,68.493,465,GTR,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...


<h3>Preprocesado de los datos</h3>

In [14]:
#Analizando los datos podemos observar que todos los pacientes con un grado de tumor bajo no tienen datos de supervivencia ni de
#extracción del tumor, ni edad. Esto es en gran medida porque estos pacientes tienen un pronostico bastante favorable [1] y la mayoria no necesita cirugia.
#Por eso asumiremos que ninguno de ellos se realizo cirugía, su edad será fijada de forma aleatoria entre el minimo y el maximo de las edades y el tiempo de supervivencia sera aleatorio entre 
#https://www.analesdepediatria.org/es-gliomas-bajo-grado-revision-10-articulo-S1695403314000873#:~:text=La%20supervivencia%20global%20fue%20del%2088%2C3%25%2C%20con%20una,fue%20del%2083%2C7%25.
#https://ascopubs.org/doi/full/10.1200/JOP.2016.018622#:~:text=A%20larger%20study%20of%20216,of%20resection%20was%20%3C%2090%25.

#Segun el estudio [2] un 76% de las personas con una extirpación de menos del 90% del tumor sobrevivieron más de 5 años. Por lo que
#como asumimos que estas personas no se realizaron cirugía ya que no tenemos datos, calcularemos el tiempo de supervivencia como un número
#aleatorio entre 4 y 7 teniendo un 24% de ser entre 4-5 y un 76% de ser entre 5-7
#En cuanto a la edad esta será calcula aleatoriamente en el rango de la media del resto de las edades con dos desviaciones tipicas [media-2dt, media+2dt] 
dataframe.loc[(dataframe['grade'] != 'HGG') & (pd.isna(dataframe['age']) == True)].head(50)

Unnamed: 0_level_0,grade,age,survival_days,t1,t1ce,t2,flair,resection__GTR,resection__NONE,resection__STR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [8]:
def generate_survival(a, b, c, probability_low, dataframe):
    '''
    '''
    np.random.seed(1)
    indexes = dataframe.loc[(pd.isna(dataframe['age']) == True)].index
    survival_fill = pd.DataFrame([0 for i in range(0, len(indexes))], columns=['fill'], index=indexes)

    for i in range(dataframe['survival_days'].isna().sum()):
        low = np.random.randint((a*365), high=(b*365)+1)
        high = np.random.randint((b*365), high=(c*365)+1)
        probability = np.random.randint(1, high=101)
        
        if probability <= probability_low:
            survival_fill['fill'] = low
        else:
            survival_fill['fill'] = high
            
    return survival_fill['fill']

def generate_age(dataframe):
    '''
    '''
    return 20
    

In [9]:
dataframe['survival_days'].fillna(generate_survival(4,5,7,24, dataframe), inplace=True)
dataframe['age'].fillna(generate_age(dataframe), inplace=True)

grade_encoding = {'LGG':0, 'HGG':1}
dataframe['grade'] = dataframe['grade'].replace(grade_encoding)

dataframe['extent_of_resection'].fillna('NONE', inplace=True)
dataframe = pd.get_dummies(dataframe, columns=['extent_of_resection'], prefix='resection_')


In [18]:
dataframe.loc[(dataframe['grade'] == 0) & (pd.isna(dataframe['age']) == False)].head(50)

Unnamed: 0_level_0,grade,age,survival_days,t1,t1ce,t2,flair,resection__GTR,resection__NONE,resection__STR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BraTS20_Training_260,0,20.0,2144,G:\Dataset\BraTS20_Training_260\BraTS20_Traini...,G:\Dataset\BraTS20_Training_260\BraTS20_Traini...,G:\Dataset\BraTS20_Training_260\BraTS20_Traini...,G:\Dataset\BraTS20_Training_260\BraTS20_Traini...,0,1,0
BraTS20_Training_261,0,20.0,2144,G:\Dataset\BraTS20_Training_261\BraTS20_Traini...,G:\Dataset\BraTS20_Training_261\BraTS20_Traini...,G:\Dataset\BraTS20_Training_261\BraTS20_Traini...,G:\Dataset\BraTS20_Training_261\BraTS20_Traini...,0,1,0
BraTS20_Training_262,0,20.0,2144,G:\Dataset\BraTS20_Training_262\BraTS20_Traini...,G:\Dataset\BraTS20_Training_262\BraTS20_Traini...,G:\Dataset\BraTS20_Training_262\BraTS20_Traini...,G:\Dataset\BraTS20_Training_262\BraTS20_Traini...,0,1,0
BraTS20_Training_263,0,20.0,2144,G:\Dataset\BraTS20_Training_263\BraTS20_Traini...,G:\Dataset\BraTS20_Training_263\BraTS20_Traini...,G:\Dataset\BraTS20_Training_263\BraTS20_Traini...,G:\Dataset\BraTS20_Training_263\BraTS20_Traini...,0,1,0
BraTS20_Training_264,0,20.0,2144,G:\Dataset\BraTS20_Training_264\BraTS20_Traini...,G:\Dataset\BraTS20_Training_264\BraTS20_Traini...,G:\Dataset\BraTS20_Training_264\BraTS20_Traini...,G:\Dataset\BraTS20_Training_264\BraTS20_Traini...,0,1,0
BraTS20_Training_265,0,20.0,2144,G:\Dataset\BraTS20_Training_265\BraTS20_Traini...,G:\Dataset\BraTS20_Training_265\BraTS20_Traini...,G:\Dataset\BraTS20_Training_265\BraTS20_Traini...,G:\Dataset\BraTS20_Training_265\BraTS20_Traini...,0,1,0
BraTS20_Training_266,0,20.0,2144,G:\Dataset\BraTS20_Training_266\BraTS20_Traini...,G:\Dataset\BraTS20_Training_266\BraTS20_Traini...,G:\Dataset\BraTS20_Training_266\BraTS20_Traini...,G:\Dataset\BraTS20_Training_266\BraTS20_Traini...,0,1,0
BraTS20_Training_267,0,20.0,2144,G:\Dataset\BraTS20_Training_267\BraTS20_Traini...,G:\Dataset\BraTS20_Training_267\BraTS20_Traini...,G:\Dataset\BraTS20_Training_267\BraTS20_Traini...,G:\Dataset\BraTS20_Training_267\BraTS20_Traini...,0,1,0
BraTS20_Training_268,0,20.0,2144,G:\Dataset\BraTS20_Training_268\BraTS20_Traini...,G:\Dataset\BraTS20_Training_268\BraTS20_Traini...,G:\Dataset\BraTS20_Training_268\BraTS20_Traini...,G:\Dataset\BraTS20_Training_268\BraTS20_Traini...,0,1,0
BraTS20_Training_269,0,20.0,2144,G:\Dataset\BraTS20_Training_269\BraTS20_Traini...,G:\Dataset\BraTS20_Training_269\BraTS20_Traini...,G:\Dataset\BraTS20_Training_269\BraTS20_Traini...,G:\Dataset\BraTS20_Training_269\BraTS20_Traini...,0,1,0


In [11]:
dataframe.head()

Unnamed: 0_level_0,grade,age,survival_days,t1,t1ce,t2,flair,resection__GTR,resection__NONE,resection__STR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BraTS20_Training_001,1,60.463,289,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,G:\Dataset\BraTS20_Training_001\BraTS20_Traini...,1,0,0
BraTS20_Training_002,1,52.263,616,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,G:\Dataset\BraTS20_Training_002\BraTS20_Traini...,1,0,0
BraTS20_Training_003,1,54.301,464,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,G:\Dataset\BraTS20_Training_003\BraTS20_Traini...,1,0,0
BraTS20_Training_004,1,39.068,788,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,G:\Dataset\BraTS20_Training_004\BraTS20_Traini...,1,0,0
BraTS20_Training_005,1,68.493,465,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,G:\Dataset\BraTS20_Training_005\BraTS20_Traini...,1,0,0


In [10]:
dataframe.describe(include='all')

Unnamed: 0,grade,age,survival_days,t1,t1ce,t2,flair,resection__GTR,resection__NONE,resection__STR
count,369.0,369.0,369.0,369,369,369,369,369.0,369.0,369.0
unique,,,219.0,369,369,369,369,,,
top,,,2144.0,G:\Dataset\BraTS20_Training_097\BraTS20_Traini...,G:\Dataset\BraTS20_Training_132\BraTS20_Traini...,G:\Dataset\BraTS20_Training_318\BraTS20_Traini...,G:\Dataset\BraTS20_Training_258\BraTS20_Traini...,,,
freq,,,133.0,1,1,1,1,,,
mean,0.794038,46.364976,,,,,,0.322493,0.650407,0.0271
std,0.404952,21.973589,,,,,,0.468065,0.477489,0.162596
min,0.0,18.975,,,,,,0.0,0.0,0.0
25%,1.0,20.0,,,,,,0.0,0.0,0.0
50%,1.0,52.348,,,,,,0.0,1.0,0.0
75%,1.0,64.378,,,,,,1.0,1.0,0.0


<h3>Carga y preprocesado de las imagenes</h3>