In [2]:
import numpy as np
import pickle
import pandas as pd
import os
import matplotlib.pyplot as plt
import SimpleITK as sitk
from sklearn.naive_bayes import GaussianNB

In [11]:
def read_kImage(path):
    # Reads the image using SimpleITK
    itkimage = sitk.ReadImage(path)
    # Convert the image to a  numpy array first and then shuffle the dimensions to get axis in the order z,y,x
    ct_scan = sitk.GetArrayFromImage(itkimage)
    return ct_scan

In [12]:
def generate_training_tables():
    basedir = "../1/train/ProstateX-TrainingLesionInformationv2/ProstateX-TrainingLesionInformationv2"
    findingsfile = "{}/{}".format(basedir,"ProstateX-Findings-Train.csv")
    imagesfile = "{}/{}".format(basedir,"ProstateX-Images-Train.csv")
    kimagesfile = "{}/{}".format(basedir,"ProstateX-Images-KTrans-Train.csv")
    #Generamos las 3 tablas básicas a partir de los archivos.cvs
    findings_table = pd.read_csv(findingsfile)
    images_table = pd.read_csv(imagesfile)
    kimages_table = pd.read_csv(kimagesfile)
    #Operación de inner join entre tablas.
    join_kimages_table = pd.merge(left=findings_table, right= kimages_table, how ="inner", on=["ProxID","fid","pos"])
    join_images_table =  pd.merge(left=findings_table, right=  images_table, how ="inner", on=["ProxID","fid","pos"])
    #agregamos una columna para insertar la imagen
    join_images_table["MRI"] = pd.Series(np.zeros((len(join_images_table),2,2,2)).tolist(), index= join_images_table.index)
    join_images_table["MRI3D"] = pd.Series(np.zeros((len(join_images_table),2,2,2)).tolist(), index= join_images_table.index)
    join_images_table["State"] = pd.Series(False, index= join_images_table.index)
    join_kimages_table["Kimg"] = pd.Series(np.zeros((len(join_kimages_table),2,2,2)).tolist(), index= join_kimages_table.index)
    join_kimages_table["Kimg3D"] = pd.Series(np.zeros((len(join_kimages_table),2,2,2)).tolist(), index= join_kimages_table.index)
    join_kimages_table["State"] = pd.Series(False, index= join_kimages_table.index)
    return join_images_table, join_kimages_table

In [13]:
def extract_voxel_data(series_path):
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames( series_path )
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    image_3d = sitk.GetArrayFromImage(image)
    return image_3d

In [14]:
def extract_slice_data(img_3d, ijk):
    state = False
    k_string = ijk.split()[2]
    max_bandwith = img_3d.shape[0]
    possible_k = int (k_string)
    if (possible_k >= max_bandwith):
        possible_k = max_bandwith -1
        slice_index = possible_k
        state = True
        print("The ijk requested is {}, there was an error since the shapes of the image is {}".format(ijk,str(img_3d.shape)))
    else:
        slice_index = possible_k
        print("The ijk requested is {},OK since shapes of the image is {}".format(ijk,str(img_3d.shape)))
    return state, img_3d[slice_index,:,:]

In [15]:
def set_MRI_value(index,image2d,image3d,state):
    join_images_table.at[index,"MRI"] = image2d
    join_images_table.at[index,"MRI3D"] = image3d
    join_images_table.at[index,"State"] = state

In [16]:
def set_kImg_value(index,image2d,image3d,state):
    join_kimages_table.at[index,"Kimg"] = image2d
    join_kimages_table.at[index,"Kimg3D"] = image3d
    join_kimages_table.at[index,"State"] = state

In [121]:
def checkout_Kimgs_problems():
    k_problems_table = join_kimages_table[join_kimages_table["State"] == True] 
    print("There were {} problems".format(len(k_problems_table)))
    percent = 100*float(len(k_problems_table)/len(join_kimages_table))

    print("It was the  {0:.5f}% of the dataset".format(percent))
    display(k_problems_table[["ProxID","ijk","State"]])

In [34]:
def checkout_MRI_table_problems():
    MRI_problems_table = join_images_table[join_images_table["State"] == True] 
    print("There were {} problems".format(len(MRI_problems_table)))
    percent = 100*float(len(MRI_problems_table)/len(join_images_table))
    print("It was the  {0:.5f}% of the dataset".format(percent))
    display(MRI_problems_table[["ProxID","fid","ClinSig","State","Name","DCMSerDescr"]])
    
    patient_problems = MRI_problems_table.ProxID.unique()
    print("The following patients have almost 1 currupted image. {}".format(patient_problems))
    candidate_table = join_images_table
    for patient in patient_problems:
        candidate_table = candidate_table[candidate_table["ProxID"] != patient]
    print("This would be the table with none of the patients with atleast 1 problem.")
    display(candidate_table[["ProxID","fid","ClinSig","State","Name","DCMSerDescr"]])
    del MRI_problems_table, percent,  patient_problems, candidate_table, patient


In [17]:
def insert_k_images():
    for row in range(len(join_kimages_table)):
        path = "../1/Train/ProstateXKtrains-train-fixed"
        patient_id = join_kimages_table.ProxID.iloc[row]
        ijk = join_kimages_table.ijk.iloc[row]
        path = "{}/{}".format(path,patient_id)
        path = "{}/{}-Ktrans.mhd".format(path,patient_id)
        kimage_3d = read_kImage(path)
        state, kimage_2d = extract_slice_data(kimage_3d,ijk)
        set_kImg_value(row,kimage_2d,kimage_3d,state)
        print("Se ha cargado la Kimagen2d #{} para el paciente {} ".format(row,patient_id))
        del path,patient_id,kimage_3d,state


In [18]:
def insert_MRI_images():
    for row in range (len(join_images_table)):
        patient_id = join_images_table.ProxID.iloc[row]
        series_name = join_images_table.DCMSerDescr.iloc[row]
        series_id = join_images_table.DCMSerNum.iloc[row]
        slice_ijk = join_images_table.ijk.iloc[row]
        path = "../1/Train/PROSTATEx/{}/".format(patient_id)#Selects the Patient ID
        path = "{}/{}".format(path,os.listdir(path)[0])#Selects the default folder inside the patient
        sequences = os.listdir(path) # Selects all the sequences
        sequence = [ sequence for sequence in sequences if sequence[0:len(str(series_id))] == str(series_id) ] [0]
        # Selects the only sequence that matches the condition.
        path = "{}/{}".format(path,sequence)
        image_3d = extract_voxel_data(path)
        state, image_2d = extract_slice_data(image_3d,slice_ijk)
        set_MRI_value(row,image_2d,image_3d,state)
        print("Se ha cargado la imagen2d #{} para el paciente {} serie: {} ".format(row,patient_id,series_name))
        del path, patient_id, series_name, series_id,sequence,sequences, image_3d, image_2d

In [140]:
def get_training_testing_tables(zone):
    number_of_cancer_samples=len(join_kimages_table[(join_kimages_table["ClinSig"] == True) & (join_kimages_table["zone"]== zone)])
    number_of_non_cancer_samples = len(join_kimages_table[(join_kimages_table["ClinSig"] == False) & (join_kimages_table["zone"]== zone)])
    training_cancer_samples = number_of_cancer_samples//2
    training_non_cancer_samples = number_of_non_cancer_samples//2
    training_cancer_samples_index = np.random.randint(0,number_of_cancer_samples,training_cancer_samples)
    training_non_cancer_samples_index = np.random.randint(0,number_of_non_cancer_samples,training_non_cancer_samples)
    training_cancer_samples_table = join_kimages_table[(join_kimages_table["ClinSig"] == True) & (join_kimages_table["zone"]==zone )].iloc[training_cancer_samples_index]
    training_non_cancer_samples_table = join_kimages_table[(join_kimages_table["ClinSig"] == False) & (join_kimages_table["zone"]== zone )].iloc[training_non_cancer_samples_index]
    #we get the posible samples that are not taken on the training set.
    samples = [i for i in range(training_cancer_samples)]
    test_cancer_samples_index = [sample for sample in samples if sample not in training_cancer_samples_index]
    test_cancer_samples_index = np.array(test_cancer_samples_index)
    samples = [i for i in range(training_non_cancer_samples)]
    test_non_cancer_samples_index = [sample for sample in samples if sample not in training_non_cancer_samples_index]
    test_non_cancer_samples_index = np.array(test_non_cancer_samples_index)
    test_cancer_samples_table = join_kimages_table[(join_kimages_table["ClinSig"] == True) & (join_kimages_table["zone"]==zone )].iloc[test_cancer_samples_index]
    test_non_cancer_samples_table = join_kimages_table[(join_kimages_table["ClinSig"] == False) & (join_kimages_table["zone"]== zone )].iloc[test_non_cancer_samples_index]
    del samples,test_cancer_samples_index,test_non_cancer_samples_index, training_cancer_samples_index
    training_table = training_cancer_samples_table.append(training_non_cancer_samples_table)
    testing_table = test_cancer_samples_table.append(test_non_cancer_samples_table)
    training_table = training_table[training_table["State"] == False]
    testing_table = testing_table[testing_table["State"]== False]
    training_table = training_table.sample(frac=1).reset_index(drop=True)
    testing_table = testing_table.sample(frac=1).reset_index(drop=True)
    print("We have {} samples for training and {} samples for testing".format(len(training_table), len(testing_table)))

    return training_table,testing_table

In [107]:
def get_data_labels_Kimages(table):
    rows,cols = table.Kimg.iloc[0].shape
    data = np.ndarray(shape=(len(table), rows, cols),
                             dtype=np.float32)
    labels = np.ndarray(shape=(len(table),1),
                             dtype=np.float32)
    for index in range (len(table)):
        curr_rows,curr_cols = table.Kimg.iloc[index].shape
        if((curr_rows != rows or curr_cols != cols)):
            data[index, : , : ] = table.Kimg.iloc[index][:rows,:cols]
            labels[index] = table.ClinSig.iloc[index]
        else:
            data[index, : , : ] = table.Kimg.iloc[index]
            labels[index] = table.ClinSig.iloc[index]
    print("The shape of the data is: {}".format(data.shape) )
    data  = data.reshape((len(table),rows*cols))
    print("The shape of the data was reshaped to: {}".format(data.shape))
    labels = labels.reshape((len(table),))
    del table
    return data, labels

In [20]:
join_images_table, join_kimages_table =  generate_training_tables()


In [None]:
insert_k_images()

In [None]:
insert_MRI_images()

In [35]:
checkout_MRI_table_problems()


There were 88 problems
It was the  2.27449% of the dataset


Unnamed: 0,ProxID,fid,ClinSig,State,Name,DCMSerDescr
14,ProstateX-0001,1,False,True,t2_loc_sag0,t2_loc sag
27,ProstateX-0002,1,True,True,t2_loc_sag0,t2_loc sag
162,ProstateX-0010,1,False,True,t2_loc_sag0,t2_loc sag
208,ProstateX-0013,1,False,True,t2_loc_sag0,t2_loc sag
311,ProstateX-0022,1,False,True,t2_loc_sag0,t2_loc sag
321,ProstateX-0023,1,False,True,t2_loc_sag0,t2_loc sag
349,ProstateX-0025,1,False,True,ep2d_diff_tra_DYNDIST_ADC0,ep2d_diff_tra_DYNDIST_ADC
357,ProstateX-0025,1,False,True,ep2d_diff_tra_DYNDISTCALC_BVAL0,ep2d_diff_tra_DYNDISTCALC_BVAL
360,ProstateX-0025,1,False,True,t2_tse_cor0,t2_tse_cor
363,ProstateX-0025,1,False,True,t2_tse_tra0,t2_tse_tra


The following patients have almost 1 currupted image. ['ProstateX-0001' 'ProstateX-0002' 'ProstateX-0010' 'ProstateX-0013'
 'ProstateX-0022' 'ProstateX-0023' 'ProstateX-0025' 'ProstateX-0029'
 'ProstateX-0031' 'ProstateX-0032' 'ProstateX-0035' 'ProstateX-0038'
 'ProstateX-0039' 'ProstateX-0041' 'ProstateX-0046' 'ProstateX-0061'
 'ProstateX-0067' 'ProstateX-0069' 'ProstateX-0074' 'ProstateX-0077'
 'ProstateX-0082' 'ProstateX-0084' 'ProstateX-0085' 'ProstateX-0086'
 'ProstateX-0088' 'ProstateX-0092' 'ProstateX-0095' 'ProstateX-0096'
 'ProstateX-0098' 'ProstateX-0104' 'ProstateX-0105' 'ProstateX-0120'
 'ProstateX-0127' 'ProstateX-0130' 'ProstateX-0134' 'ProstateX-0135'
 'ProstateX-0137' 'ProstateX-0138' 'ProstateX-0144' 'ProstateX-0146'
 'ProstateX-0147' 'ProstateX-0148' 'ProstateX-0149' 'ProstateX-0151'
 'ProstateX-0154' 'ProstateX-0155' 'ProstateX-0157' 'ProstateX-0158'
 'ProstateX-0159' 'ProstateX-0161' 'ProstateX-0163' 'ProstateX-0171'
 'ProstateX-0172' 'ProstateX-0173' 'ProstateX-017

Unnamed: 0,ProxID,fid,ClinSig,State,Name,DCMSerDescr
0,ProstateX-0000,1,True,False,ep2d_diff_tra_DYNDIST_ADC0,ep2d_diff_tra_DYNDIST_ADC
1,ProstateX-0000,1,True,False,ep2d_diff_tra_DYNDIST0,ep2d_diff_tra_DYNDIST
2,ProstateX-0000,1,True,False,ep2d_diff_tra_DYNDIST1,ep2d_diff_tra_DYNDIST
3,ProstateX-0000,1,True,False,ep2d_diff_tra_DYNDIST2,ep2d_diff_tra_DYNDIST
4,ProstateX-0000,1,True,False,ep2d_diff_tra_DYNDISTCALC_BVAL0,ep2d_diff_tra_DYNDISTCALC_BVAL
5,ProstateX-0000,1,True,False,t2_tse_cor0,t2_tse_cor
6,ProstateX-0000,1,True,False,t2_tse_sag0,t2_tse_sag
7,ProstateX-0000,1,True,False,t2_tse_tra0,t2_tse_tra
8,ProstateX-0000,1,True,False,tfl_3d_PD_ref_tra_1_5x1_5_t30,tfl_3d PD ref_tra_1.5x1.5_t3
44,ProstateX-0003,1,False,False,ep2d_diff_tra_DYNDIST_ADC0,ep2d_diff_tra_DYNDIST_ADC


In [122]:
checkout_Kimgs_problems()

There were 3 problems
It was the  0.89552% of the dataset


Unnamed: 0,ProxID,ijk,State
34,ProstateX-0025,89 67 24,True
159,ProstateX-0105,63 78 17,True
248,ProstateX-0154,57 72 16,True


In [155]:
zones = join_kimages_table.zone.unique()
zone = zones[0]
print(zone)
training_table, testing_table = get_training_testing_tables(zone)
training_table[["ProxID", "ClinSig"]]
print ("There is a total of {} patients with cancer at the training data".format(sum(training_table.ClinSig)))
print ("There is a total of {} patients with cancer at the testing data".format(sum (testing_table.ClinSig)))
get_training_testing_tables2(zone)

PZ
We have 96 samples for training and 62 samples for testing
There is a total of 18 patients with cancer at the training data
There is a total of 11 patients with cancer at the testing data
18 18


In [151]:
training_data, training_labels = get_data_labels_Kimages(training_table)
testing_data, testing_labels = get_data_labels_Kimages(testing_table)

The shape of the data is: (97, 128, 128)
The shape of the data was reshaped to: (97, 16384)
The shape of the data is: (62, 128, 128)
The shape of the data was reshaped to: (62, 16384)


In [152]:
classifier1 = GaussianNB()
classifier1.fit(training_data, training_labels)
result = classifier1.score(testing_data,testing_labels)
print ("This was an experiment using GaussianNB classifier and only kimages of the {}.".format(zone)) 
print("the number of samples in training were {} ".format(
    len(training_table)) + "and the number of samples in testing were {} ".format(len(testing_data)))

print("the score of the accuracy in this experiment was {0:.5f}%".format(float(result)*100))

This was an experiment using GaussianNB classifier and only kimages of the PZ.
the number of samples in training were 97 and the number of samples in testing were 62 
the score of the accuracy in this experiment was 79.03226%


In [46]:
60*0.75

45.0

In [159]:
base_table = join_kimages_table[(join_kimages_table["State"] == False) & (join_kimages_table["zone"] == zone)]
cancer_table_indexes = base_table[base_table["ClinSig"] == True].index.values
non_cancer_table = base_table[base_table["ClinSig"] == False].index.values
training_cancer_samples = len(cancer_table_indexes)//2
training_non_cancer_samples = (len(cancer_table_indexes)//2) 
print (training_cancer_samples,training_non_cancer_samples)
print(cancer_table_indexes[:training_cancer_samples])
print(cancer_table_indexes[]tra)

18 18


array([  0,   2,   7,   8,  25,  45,  49,  64,  67,  72,  97, 112, 140,
       145, 153, 176, 180, 184], dtype=int64)