In [5]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
import h5py

# Fixed seed: the shuffled order decides which rows land in the training vs.
# validation slices taken further down, so it must be identical on every
# Restart & Run All (and must match between the label and image cells).
SEED = 42

df = pd.read_csv('Data_Entry_2017.csv')
bl = pd.read_csv('blacklist.csv')
bl_list = bl['black'].tolist()

# Remove rows with unreasonable ages: drop the 16 rows with the largest
# 'Patient Age', then rescale the remaining ages by dividing by the new maximum.
df = df.drop(df.sort_values(by='Patient Age', ascending=False).head(16).index)
df['Patient Age'] = df['Patient Age'] / df['Patient Age'].max()

# Whitespace-separated image file names of the test split (x) and of the
# combined train/validation split (y).
with open('test_list.txt', 'r') as f1:
    x = f1.read().split()
with open('train_val_list.txt', 'r') as f2:
    y = f2.read().split()


def _select_split(frame, image_names, blacklist, seed):
    """Return the shuffled rows of ``frame`` that belong to one split.

    Keeps the rows whose 'Image Index' is listed in ``image_names`` and is not
    listed in ``blacklist``, then shuffles them reproducibly with ``seed``.
    """
    wanted = frame['Image Index'].isin(image_names)
    banned = frame['Image Index'].isin(blacklist)
    return shuffle(frame.loc[wanted & ~banned], random_state=seed)


train = _select_split(df, y, bl_list, SEED)
train_files_list = train['Image Index'].tolist()

test = _select_split(df, x, bl_list, SEED)
test_files_list = test['Image Index'].tolist()

In [6]:
# Sanity check: display the (rows, columns) of the training split.
train.shape

(85678, 12)

In [7]:
# Sanity check: display the (rows, columns) of the test split.
test.shape

(25258, 12)

In [11]:
def split_labels(label):
    """Break a '|'-separated findings string into a list of label names.

    A string without any '|' yields a single-element list.
    """
    findings = label.split('|')
    return findings

# Turn every row's 'Finding Labels' string into a list of label names:
# `a` holds the result for the training split, `b` for the test split.
a, b = (
    split_frame['Finding Labels'].apply(split_labels)
    for split_frame in (train, test)
)

In [12]:
# Create MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()

# One-hot encode data.
# The label vocabulary is learned from the training split only; the test split
# is encoded with that SAME fitted mapping (transform, not fit_transform).
# Re-fitting on the test split would rebuild the class list from the test
# labels, so a label missing from (or only present in) one split would shift
# the columns and silently misalign train labels, test labels and labels_list.
trainL15 = one_hot.fit_transform(np.array(a))
testL15 = one_hot.transform(np.array(b))

# List of labels; its order is the column order of both matrices above
labels_list = one_hot.classes_
print(labels_list)

['Atelectasis' 'Cardiomegaly' 'Consolidation' 'Edema' 'Effusion'
 'Emphysema' 'Fibrosis' 'Hernia' 'Infiltration' 'Mass' 'No Finding'
 'Nodule' 'Pleural_Thickening' 'Pneumonia' 'Pneumothorax']


In [13]:
# Save the 15-class one-hot label matrices.
# The files are opened in `with` blocks so each handle is flushed and closed as
# soon as the array is written (the bare open(...) calls were never closed).
# An open file object is passed on purpose: given a path string, np.save would
# append '.npy' to these '.dat' names.
with open('./efs/cropB/trainL15.dat', 'wb') as f:
    np.save(f, trainL15, allow_pickle=False)
with open('./efs/cropB/testL15.dat', 'wb') as f:
    np.save(f, testL15, allow_pickle=False)

In [14]:
# All-zero 14-class label row; kept defined for any other cell that refers to it.
zero = np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0])

# reshape train label to 14 labelling style
# Column 10 of the 15-class encoding is 'No Finding' (see the printed label
# list). The 14-class row is the 15-class row without that column, and a row
# flagged 'No Finding' becomes all zeros.
# Done in one vectorized step: stacking one row at a time with np.vstack
# re-copies the whole result on every iteration (quadratic in the row count).
NO_FINDING_COL = 10
trainL = np.delete(trainL15, NO_FINDING_COL, axis=1)
trainL[trainL15[:, NO_FINDING_COL] == 1] = 0

In [15]:
# reshape test label to 14 labelling style
# Column 10 of the 15-class encoding is 'No Finding' (see the printed label
# list). The 14-class row is the 15-class row without that column, and a row
# flagged 'No Finding' becomes all zeros.
# Done in one vectorized step: stacking one row at a time with np.vstack
# re-copies the whole result on every iteration (quadratic in the row count).
NO_FINDING_COL = 10
testL = np.delete(testL15, NO_FINDING_COL, axis=1)
testL[testL15[:, NO_FINDING_COL] == 1] = 0

In [16]:
# Save the 14-class training labels and the shuffled file-name lists.
# Each file is opened in a `with` block so the handle is flushed and closed
# right after writing (the bare open(...) calls were never closed).
# An open file object is passed on purpose: given a path string, np.save would
# append '.npy' to these '.dat' names.
with open('./efs/cropB/trainL.dat', 'wb') as f:
    np.save(f, trainL, allow_pickle=False)
with open('./efs/cropB/train_files_list.dat', 'wb') as f:
    np.save(f, train_files_list, allow_pickle=False)
with open('./efs/cropB/test_files_list.dat', 'wb') as f:
    np.save(f, test_files_list, allow_pickle=False)

In [22]:
from keras.preprocessing import image                  
from tqdm import tqdm_notebook as tqdm
from keras.preprocessing import image                  
from tqdm import tqdm_notebook as tqdm
import boto3
import tempfile
import matplotlib.image as mpimg
from PIL import ImageFile
from matplotlib.pyplot import imshow

# Side length (pixels) of the square centre crop returned for every image.
_CROP_SIZE = 224

# S3 bucket handle, created on first use and then shared by all calls so that
# a new boto3 resource is not built for every single image.
_S3_BUCKET = None


def _get_bucket():
    """Return the dataset's S3 bucket, creating the boto3 handle on first use."""
    global _S3_BUCKET
    if _S3_BUCKET is None:
        # SECURITY: no access keys are passed here. boto3 resolves credentials
        # through its standard chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
        # environment variables, the shared credentials file, or an instance
        # role). The key pair that used to be hard-coded in this cell must be
        # treated as leaked and rotated.
        s3 = boto3.resource('s3', region_name='us-east-1')
        _S3_BUCKET = s3.Bucket('nih-chest-xrays-dataset')
    return _S3_BUCKET


# Helper method to convert images to training tensors
def path_to_tensor(img_path, shape):
    """Download one image from S3 and return it as a 4D tensor.

    Parameters
    ----------
    img_path : str
        File name of the image; it is fetched from the key 'images/' + img_path.
    shape : tuple
        Passed to ``image.load_img`` as ``target_size``; the image is resized
        to it before cropping.

    Returns
    -------
    numpy.ndarray
        The centre ``_CROP_SIZE`` x ``_CROP_SIZE`` crop of the resized image,
        with pixel values divided by 255 and a leading axis of length 1, i.e.
        shape (1, 224, 224, 3) for an RGB image.
    """
    s3_object = _get_bucket().Object('images/' + img_path)

    # Global PIL switch: tolerate image files whose data ends early.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    with tempfile.NamedTemporaryFile() as tmp:
        s3_object.download_fileobj(tmp)
        # Push buffered bytes to disk before the file is re-opened by name
        # below; otherwise the reader can see an incomplete file.
        tmp.flush()
        # loads RGB image as PIL.Image.Image type
        img = image.load_img(tmp.name, target_size=shape)
        # Centre crop. For a 300x300 image this is the box (38, 38, 262, 262);
        # deriving it from the actual size keeps it centred for other shapes.
        width, height = img.size
        left = (width - _CROP_SIZE) // 2
        top = (height - _CROP_SIZE) // 2
        cropped_im = img.crop((left, top, left + _CROP_SIZE, top + _CROP_SIZE))
        # convert PIL.Image.Image type to 3D tensor with shape (224, 224, 3)
        x = image.img_to_array(cropped_im)/255
    # convert 3D tensor to 4D tensor with shape (1, 224, 224, 3) and return 4D tensor
    return np.expand_dims(x, axis=0)

# Convert images to training tensors
def paths_to_tensor(img_paths, shape):
    """Load every image named in ``img_paths`` and stack the results.

    Each path is converted with ``path_to_tensor(path, shape)`` while a tqdm
    progress bar tracks the iteration; the per-image tensors are then stacked
    along the first axis and returned as one array.
    """
    per_image = []
    for single_path in tqdm(img_paths):
        per_image.append(path_to_tensor(single_path, shape))
    return np.vstack(per_image)

In [19]:
# Display the (rows, columns) of the training split again.
train.shape

(85678, 12)

In [23]:
# Build the image tensor for the training portion: the first 76200 entries of
# the shuffled train/validation file list, each resized to img_shape first.
img_shape = (300, 300)
train_tensors = paths_to_tensor(
    shape=img_shape,
    img_paths=train_files_list[:76200],
)

# Write the training tensor to an HDF5 file as the dataset named "training".
with h5py.File('./efs/cropB/training.hdf5', mode='w') as hf:
    hf.create_dataset("training", data=train_tensors)

HBox(children=(IntProgress(value=0, max=76200), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Build the image tensor for the validation portion: every entry of the
# shuffled train/validation file list from index 76200 onwards.
img_shape = (300,300)
valid_tensors = paths_to_tensor(train_files_list[76200:], shape = img_shape)

# Opened in a `with` block so the handle is flushed and closed once the array
# is written (the bare open(...) call was never closed). A file object is
# passed on purpose: given a path string, np.save would append '.npy'.
with open('./efs/cropB/validation.dat', 'wb') as f:
    np.save(f, valid_tensors, allow_pickle=False)

HBox(children=(IntProgress(value=0, max=9478), HTML(value='')))

In [24]:
# Build the image tensor for the whole test split. The list is only iterated,
# so it is passed directly instead of as a [:] copy.
img_shape = (300,300)
test_tensors = paths_to_tensor(test_files_list, shape = img_shape)

# Opened in a `with` block so the handle is flushed and closed once the array
# is written (the bare open(...) call was never closed). A file object is
# passed on purpose: given a path string, np.save would append '.npy'.
with open('./efs/cropB/testing.dat', 'wb') as f:
    np.save(f, test_tensors, allow_pickle=False)

HBox(children=(IntProgress(value=0, max=25258), HTML(value='')))

In [20]:
# Split training labels
# The 76200 boundary is the same one used to slice train_files_list for the
# image tensors, so labels and images stay aligned row for row.
train_labels = trainL[:76200]
valid_labels = trainL[76200:]
test_labels = testL[:]

# Save training labels
# Each file is opened in a `with` block so the handle is flushed and closed
# right after writing (the bare open(...) calls were never closed). A file
# object is passed on purpose: given a path string, np.save would append '.npy'.
with open('./efs/cropB/trainLabels.dat', 'wb') as f:
    np.save(f, train_labels, allow_pickle=False)
with open('./efs/cropB/validLabels.dat', 'wb') as f:
    np.save(f, valid_labels, allow_pickle=False)
with open('./efs/cropB/testLabels.dat', 'wb') as f:
    np.save(f, test_labels, allow_pickle=False)

In [9]:
import numpy as np

# Reload previously saved image tensors, labels and extra data arrays from disk.
# NOTE(review): every path below is directly under './efs/', whereas the save
# cells visible in this notebook write under './efs/cropB/' -- confirm which
# directory is intended.
with h5py.File('./efs/training.hdf5', 'r') as hf:
    # NOTE(review): the visible save cell creates its dataset under the key
    # "training"; confirm that './efs/training' exists in the file read here.
    train_tensors = hf['./efs/training'][:]  # [:] reads the whole dataset into memory
valid_tensors = np.load('./efs/validation.dat')
test_tensors = np.load('./efs/testing.dat')

# Label arrays for the train / validation / test portions.
train_labels = np.load('./efs/trainLabels.dat')
valid_labels = np.load('./efs/validLabels.dat')
test_labels = np.load('./efs/testLabels.dat')

# NOTE(review): no cell visible in this notebook writes trainData.dat,
# validData.dat or testData.dat -- presumably produced elsewhere; verify.
train_data = np.load('./efs/trainData.dat')
valid_data = np.load('./efs/validData.dat')
test_data = np.load('./efs/testData.dat')

In [1]:
# Install tqdm, keras and tensorflow by shelling out to pip.
# NOTE(review): this cell carries execution count 1 but sits at the bottom of
# the notebook; it has to run before the cells that import these packages.
# NOTE(review): no versions are pinned, so a re-run installs whatever is current
# (the captured output shows tqdm 4.24.0 and Keras 2.2.2 were installed).
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may
# not belong to the kernel's environment -- confirm.
!pip install tqdm
!pip install keras
!pip install tensorflow

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/7d/e6/19dfaff08fcbee7f3453e5b537e65a8364f1945f921a36d08be1e2ff3475/tqdm-4.24.0-py2.py3-none-any.whl (43kB)
[K    100% |████████████████████████████████| 51kB 10.8MB/s ta 0:00:01
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.24.0
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting keras
[?25l  Downloading https://files.pythonhosted.org/packages/34/7d/b1dedde8af99bd82f20ed7e9697aac0597de3049b1f786aa2aac3b9bd4da/Keras-2.2.2-py2.py3-none-any.whl (299kB)
[K    100% |████████████████████████████████| 307kB 15.6MB/s ta 0:00:01
Collecting keras-preprocessing==1.0.2 (from keras)
  Downloading https://files.pythonhosted.org/packages/71/26/1e778ebd737032749824d5cba7dbd3b0cf9234b87ab5ec79f5f0403ca7e9/Keras_Preproces