In [13]:
import numpy as np
#import png
import pydicom as dicom
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import functools as ft
import re
import cv2
import PIL # optional
import csv
from pydicom.pixel_data_handlers.util import apply_color_lut
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import train_test_split

In [14]:
feat_names = ["tum_loc", "side_of_tum_epicenter", "eloq_brain", "enh_quality", "prop_enh", "prop_ncet", "prop_necr", "cysts", "mf_or_mc", "t1_f_ratio", "thick_em", "def_em", "def_nem", "prop_edema", "edema_crosses_ml", "hem", "diff", "pial_inv", "epe_inv", "cort_involv", "deep_wm_inv", "ncet_crosses_ml", "enh_tum_crosses_ml", "satellites", "calv_remodel", "ext_res_enh_tum", "ext_res_ncet", "ext_res_vas_edema", "les_size1", "les_size2"]
feat_mapping = dict(zip(["f%d" % i for i in range(1, 31)], feat_names))

categorical_cols = [1, 2, 3, 4, 9, 10, 11, 12, 13, 17]
yes_no_cols = [8, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25]
cm_cols = [29, 30]
percent_cols = [5, 6, 7, 14, 26, 27, 28] 

class Utils():    
    def show_dcm(pixel_array):
        plt.imshow(pixel_array, cmap = plt.get_cmap("bone"))
        
    def dcm_downsample(pixel_array, interval):
        return pixel_array[::interval, ::interval]
    

def union_categorical_features(series):

    as_int_tups = []
    for categories in series.tolist():
        if type(categories) == str:
            as_int_tups.append(map(int, re.findall(r"\d+", categories)))
        else:
            as_int_tups.append([categories])
        
    as_int_sets = [set(tup) for tup in as_int_tups]
    return tuple(ft.reduce(lambda x, y: x.union(y), as_int_sets))

def mri_group_categorical_feats(group_df):
    regrouping = {}
    categorical_feats = ["f%d" % d for d in categorical_cols]
    
    for feat in categorical_feats:
        regrouping[feat] = union_categorical_features(group_df[feat])
        
    return pd.Series(regrouping, index=categorical_feats)


In [15]:
REMBRANDT_PATH = "C:\\Users\\Sandhya Rao\\Desktop\\REMBRANDT/"
CLINICAL_DATA_PATH = "C:\\Users\\Sandhya Rao\\Downloads\\clinical_2014-01-16.xlsx"
MRI_FEATURE_PATH = "C:\\Users\\Sandhya Rao\\Downloads\\VASARI_MRI_features (gmdi - wiki).xls"

In [16]:
clinical_df = pd.read_excel(CLINICAL_DATA_PATH)
mri_df = pd.read_excel(MRI_FEATURE_PATH)
mri_df = mri_df.groupby("GMDI").apply(mri_group_categorical_feats)
clinical_df = clinical_df.rename(columns={'Sample':'GMDI'})
mri_df = mri_df.rename_axis("GMDI").reset_index()
df = pd.merge(clinical_df, mri_df, how="inner", on="GMDI")
df["GMDI"] = df["GMDI"].str.replace("_","-")
labelsdf=pd.DataFrame()
labelsdf["Labels"]=df[" Disease"]
labelsdf.head

<bound method NDFrame.head of                 Labels
0          ASTROCYTOMA
1                  GBM
2          ASTROCYTOMA
3                  GBM
4                  GBM
5                  GBM
6                  GBM
7    OLIGODENDROGLIOMA
8                  GBM
9    OLIGODENDROGLIOMA
10                 GBM
11                 GBM
12   OLIGODENDROGLIOMA
13                 GBM
14         ASTROCYTOMA
15                 GBM
16         ASTROCYTOMA
17                 GBM
18                 GBM
19                 GBM
20   OLIGODENDROGLIOMA
21                 GBM
22                 GBM
23                 GBM
24                 GBM
25         ASTROCYTOMA
26                 GBM
27         ASTROCYTOMA
28   OLIGODENDROGLIOMA
29         ASTROCYTOMA
30                 GBM
31                 GBM>

In [17]:
nparr=np.array(labelsdf["Labels"])
ll = nparr.reshape((len(nparr), 1))

In [18]:
lb = preprocessing.LabelBinarizer()

lb.fit(ll)
lb.classes_

array([' ASTROCYTOMA', ' GBM', ' OLIGODENDROGLIOMA'], dtype='<U18')

In [19]:
Y=pd.DataFrame()
#Y["Labels"]
#print(type(
Y["id"]=df["GMDI"]
Y["labels"]=list(lb.transform(ll))
#))
Y

Unnamed: 0,id,labels
0,900-00-1961,"[1, 0, 0]"
1,900-00-5332,"[0, 1, 0]"
2,900-00-5308,"[1, 0, 0]"
3,900-00-5316,"[0, 1, 0]"
4,900-00-5317,"[0, 1, 0]"
5,900-00-5338,"[0, 1, 0]"
6,900-00-5339,"[0, 1, 0]"
7,900-00-5341,"[0, 0, 1]"
8,900-00-5342,"[0, 1, 0]"
9,900-00-5345,"[0, 0, 1]"


In [20]:
len(Y)

32

In [21]:
Y=Y.set_index("id")
Y

Unnamed: 0_level_0,labels
id,Unnamed: 1_level_1
900-00-1961,"[1, 0, 0]"
900-00-5332,"[0, 1, 0]"
900-00-5308,"[1, 0, 0]"
900-00-5316,"[0, 1, 0]"
900-00-5317,"[0, 1, 0]"
900-00-5338,"[0, 1, 0]"
900-00-5339,"[0, 1, 0]"
900-00-5341,"[0, 0, 1]"
900-00-5342,"[0, 1, 0]"
900-00-5345,"[0, 0, 1]"


In [23]:
dcm_paths = []
image_list=[]
patient_list=[]
path_list=[]
label_list=[]
pathtemp=[]
n=1
for patient_id in df["GMDI"]:
    pathtemp=glob.glob(REMBRANDT_PATH + "%s/*/*/*.dcm" % patient_id)
#    n=n+1
#    print("--------------------------------------")
    for path in pathtemp:
        dicomimg=dicom.dcmread(path) 
        img = dicomimg.pixel_array
        #rgb= apply_color_lut(img, palette='PET')
        #data_downsampling = rgb[::2, ::2]    
        resized = cv2.resize(img, (256,256), interpolation = cv2.INTER_CUBIC)
        image_list.append(resized)
        patient_list.append(patient_id)
        label_list.append(Y.loc[str(patient_id),"labels"])
        path_list.append(path)
    #dcm_paths+=pathtemp
    #if n==3:
        #break
print("No. of paths : ",len(path_list))
print("No. of images : ",len(image_list))
print("No. of patients : ",len(patient_list))
print("No. of labels : ",len(label_list))

No. of paths :  21223
No. of images :  21223
No. of patients :  21223
No. of labels :  21223


In [27]:
count1=0
count2=0
count3=0
count4=0
count5=0
total=0
img_list=[]
for x in image_list:
    img=x
    sh=img.shape
    if sh==(128, 128) :
        count1+=1
    elif sh==(256, 256) :
        #img=img[::2, ::2]
        count2+=1
    elif sh==(288, 288) :
        count3+=1
    elif sh==(432, 432) :
        count4+=1
    elif sh==(512, 512) :
        #img=img[::4, ::4]
        count5+=1
    else:
        print(sh)
    total+=1
    img_list.append(img)
        #print(img_2d_scaled.shape,img_2d_scaled.dtype)
print("128: ",count1)
print("256: ",count2)
print("288: ",count3)
print("432: ",count4)
print("512: ",count5)
print(count1+count2+count3+count4+count5)
print("Total: ",total)
print(len(image_list))

    #img_list.append(img)

128:  0
256:  21223
288:  0
432:  0
512:  0
21223
Total:  21223
21223


In [26]:
for i in range(21223):
    sh=image_list[i].shape
    if sh!=(256, 256) :
        print(sh)

In [None]:
p='C:\\Users\\Sandhya Rao\\Desktop\\REMBRANDT/900-00-5380\\07-03-2005-36342\\601-41047\\000000.dcm'
i=dicom.dcmread(p)
img=i.pixel_array
plt.imshow(img,cmap='gray')
#i= apply_color_lut(img, palette='PET')
resized = cv2.resize(img, (128,128), interpolation = cv2.INTER_CUBIC)
#label_list
resized.shape

In [None]:
plt.imshow(resized,cmap='gray')

In [None]:
print(resized.shape)
print(img.shape)

In [None]:
plt.imshow(resized)

In [28]:
x_train= image_list
y_train= label_list
train_imgname=path_list
train=list(zip(x_train,y_train,train_imgname))

In [None]:
#For testing
''''
for i in range(0,21223,1000):
    print(path_list[i])
    print(patient_list[i])
    print(label_list[i])
    print("-------------------------")

In [29]:
import random
random.shuffle(train)
x_train,y_train,train_imgname=zip(*train)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.3)

In [31]:
print((y_train[0].shape))
X_train[0].shape
#print( nx, ny)

(3,)


(256, 256)

In [33]:
import h5py
hf = h5py.File('C:\\Users\\Sandhya Rao\\Desktop\\REMBRANDT/data.h5', 'w')

OSError: Unable to create file (unable to truncate a file which is already open)

In [39]:
X=np.array(image_list)
Y=np.array(label_list)
print(type(X))

<class 'numpy.ndarray'>


In [40]:
hf.create_dataset('X_train', data=X)
hf.create_dataset('Y_train', data=Y)

<HDF5 dataset "Y_train": shape (21223, 3), type "<i4">

In [41]:
hf.close()