In [9]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
import h5py

# Load the image metadata table; 'Image Index' holds the image file name and
# 'Finding Labels' the '|'-separated findings for that image.
df = pd.read_csv('Data_Entry_2017.csv')

diseases = ['Cardiomegaly','Emphysema','Effusion','Hernia','Nodule','Pneumothorax','Atelectasis','Pleural_Thickening','Mass','Edema','Consolidation','Infiltration','Fibrosis','Pneumonia']

# Remove rows with unreasonable ages: the rows holding the largest
# 'Patient Age' values are dropped.
# NOTE(review): the count is hard-coded; confirm it still matches the number
# of implausible ages if the CSV changes.
N_AGE_OUTLIERS = 16
df = df.drop(df.sort_values(by='Patient Age',ascending=False).head(N_AGE_OUTLIERS).index)
# Scale ages into (0, 1] by dividing by the largest remaining age.
df['Patient Age'] = df['Patient Age']/df['Patient Age'].max()

# File names belonging to the predefined test and train/validation lists
# (whitespace-separated in the text files).
with open('test_list.txt', 'r') as f:
    test_image_names = f.read().split()
with open('train_val_list.txt', 'r') as f:
    train_val_image_names = f.read().split()

# Keep only images that have at least one finding.
train = df.loc[df['Image Index'].isin(train_val_image_names)]
train = train[train['Finding Labels'] != 'No Finding']

test = df.loc[df['Image Index'].isin(test_image_names)]
test = test[test['Finding Labels'] != 'No Finding']

# Move the first N_TEST_TO_TRAIN rows of the test list into the training pool
# and keep the remainder as the test set. A single split point guarantees the
# two parts never overlap and never drop rows, whatever the length of `test`
# (head(5500)/tail(10231) was only a clean partition for exactly 15731 rows).
# NOTE(review): if the list files are split by patient, this positional move
# may put images of one patient on both sides — confirm that is acceptable.
N_TEST_TO_TRAIN = 5500
train = pd.concat([train, test.iloc[:N_TEST_TO_TRAIN]])
test = test.iloc[N_TEST_TO_TRAIN:]

In [10]:
# Sanity check: (rows, columns) of the training frame built in the previous cell.
train.shape

(41520, 12)

In [11]:
# Sanity check: (rows, columns) of the test frame built in the first cell.
test.shape

(10231, 12)

In [None]:
# Fixed seed: the train/validation split further down is positional
# (first 36330 rows vs. the rest), so the row order must be identical on
# every run for the saved file lists, labels and tensors to line up.
SEED = 42

# sort_index() restores a canonical row order before shuffling, which makes
# this cell idempotent: re-running it yields the same permutation instead of
# reshuffling an already shuffled frame.
train = shuffle(train.sort_index(), random_state=SEED)
train_files_list = train['Image Index'].tolist()

test = shuffle(test.sort_index(), random_state=SEED)
test_files_list = test['Image Index'].tolist()

In [11]:
# A function to split labels 
def split_labels(label):
    """Break a '|'-separated findings string into a list of label names.

    Args:
        label: String such as ``'Atelectasis|Effusion'``.

    Returns:
        list of str, one entry per finding (a string without ``'|'`` yields a
        single-element list).
    """
    separator = '|'
    findings = label.split(separator)
    return findings

# Turn each row's '|'-separated 'Finding Labels' string into a list of label
# names. `a` holds the lists for the training rows and `b` for the test rows,
# in the same row order as `train` / `test`.
a = train['Finding Labels'].apply(split_labels)
b = test['Finding Labels'].apply(split_labels)

In [12]:
# Create MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()

# One-hot encode data.
# The binarizer is fitted on the TRAINING labels only and then re-used to
# transform the test labels, so both matrices share the same columns in the
# same order. Re-fitting on the test labels would derive the columns from
# whatever labels happen to occur in the test set, which can silently change
# their number/order relative to the training matrix.
# (Labels that occur only in the test set are ignored by transform(), with a
# warning.)
trainL = one_hot.fit_transform(a)
testL = one_hot.transform(b)

# Column order of trainL / testL
labels_list = one_hot.classes_
print(labels_list)

['Atelectasis' 'Cardiomegaly' 'Consolidation' 'Edema' 'Effusion'
 'Emphysema' 'Fibrosis' 'Hernia' 'Infiltration' 'Mass' 'Nodule'
 'Pleural_Thickening' 'Pneumonia' 'Pneumothorax']


In [13]:
# Persist the shuffled file lists and the full one-hot label matrices.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (passing a bare open(...) to np.save leaks the handle).
# A file object rather than a path is passed to np.save so the '.dat' names
# are kept as-is (np.save appends '.npy' to path arguments).
for out_path, payload in [
    ('./efs/disease712/train_files_list.dat', train_files_list),
    ('./efs/disease712/trainL.dat', trainL),
    ('./efs/disease712/test_files_list.dat', test_files_list),
    ('./efs/disease712/testL.dat', testL),
]:
    with open(out_path, 'wb') as out_file:
        np.save(out_file, payload, allow_pickle=False)

In [14]:
from keras.preprocessing import image                  
from tqdm import tqdm_notebook as tqdm
from keras.preprocessing import image                  
from tqdm import tqdm_notebook as tqdm
import boto3
import tempfile
import matplotlib.image as mpimg
from PIL import ImageFile

# Helper method to convert images to training tensors 
def path_to_tensor(img_path, shape):
    """Download one image from S3 and return it as a 4D float tensor.

    Args:
        img_path: File name of the image; it is looked up under the
            ``images/`` prefix of the ``nih-chest-xrays-dataset`` bucket.
        shape: ``(height, width)`` passed to ``load_img`` as ``target_size``.

    Returns:
        numpy array of shape ``(1, height, width, channels)`` with pixel
        values divided by 255.

    Notes:
        AWS credentials are NOT stored in the notebook. boto3 resolves them
        through its default chain (``AWS_ACCESS_KEY_ID`` /
        ``AWS_SECRET_ACCESS_KEY`` environment variables, the shared
        credentials file, or an attached IAM role). Any key that was
        previously committed in this cell must be considered leaked and be
        rotated.
    """
    s3 = boto3.resource('s3', region_name='us-east-1')
    s3_object = s3.Bucket('nih-chest-xrays-dataset').Object('images/' + img_path)

    # Kept from the original: tolerate images whose payload is incomplete
    # instead of raising while decoding.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    # The context manager closes (and thereby deletes) the temp file even if
    # the download or decoding fails.
    with tempfile.NamedTemporaryFile() as tmp:
        s3_object.download_fileobj(tmp)
        # Push buffered bytes to disk before the file is re-opened by name;
        # otherwise the reader may see a truncated image.
        tmp.flush()
        # loads RGB image as PIL.Image.Image type
        img = image.load_img(tmp.name, target_size=shape)
        # convert PIL.Image.Image type to a 3D tensor (height, width, channels);
        # done inside the block so the pixels are read before the file goes away
        x = image.img_to_array(img)/255
    # add a leading batch axis -> (1, height, width, channels)
    return np.expand_dims(x, axis=0)

# Convert images to training tensors 
def paths_to_tensor(img_paths, shape):
    """Load every image in ``img_paths`` and stack the results along axis 0.

    Args:
        img_paths: Iterable of image file names, each handed to
            ``path_to_tensor``.
        shape: ``(height, width)`` target size forwarded to ``path_to_tensor``.

    Returns:
        numpy array made by vertically stacking the per-image tensors, i.e.
        one entry along the first axis per input path.
    """
    tensors = []
    # tqdm only wraps the iterable to display a progress bar.
    for single_path in tqdm(img_paths):
        tensors.append(path_to_tensor(single_path, shape))
    stacked = np.vstack(tensors)
    return stacked

Using TensorFlow backend.


In [None]:
# Build the training tensors (images).
# img_shape is the (height, width) every image is resized to on load.
# The first 36330 entries of the shuffled training file list form the
# training set; the remaining entries are used for validation in a later cell.
img_shape = (224, 224)
train_tensors = paths_to_tensor(train_files_list[:36330], shape = img_shape)

# Save training tensors (images) as dataset "training" in an HDF5 file;
# mode 'w' overwrites any existing file at that path.
with h5py.File('./efs/disease712/training.hdf5', 'w') as hf:
    hf.create_dataset("training",  data=train_tensors)

HBox(children=(IntProgress(value=0, max=32400), HTML(value='')))

In [17]:
# Validation set: every shuffled training file from index 36330 onwards
# (the part not used for the training tensors).
valid_tensors = paths_to_tensor(train_files_list[36330:], shape = img_shape)
# `with` closes the file handle deterministically; a file object (not a path)
# is passed so np.save keeps the '.dat' name.
with open('./efs/disease712/validation.dat', 'wb') as out_file:
    np.save(out_file, valid_tensors, allow_pickle=False)

HBox(children=(IntProgress(value=0, max=3620), HTML(value='')))

In [19]:
# Test set: all files of the shuffled test list (no copy of the list needed,
# it is only iterated).
test_tensors = paths_to_tensor(test_files_list, shape = img_shape)
# `with` closes the file handle deterministically; a file object (not a path)
# is passed so np.save keeps the '.dat' name.
with open('./efs/disease712/testing.dat', 'wb') as out_file:
    np.save(out_file, test_tensors, allow_pickle=False)

HBox(children=(IntProgress(value=0, max=15731), HTML(value='')))

In [16]:
# Split the one-hot training labels at the same position (36330) that is used
# for the image tensors, so labels and images stay aligned row by row.
train_labels = trainL[:36330]
valid_labels = trainL[36330:]
test_labels = testL[:]

# Save the label matrices. Each file is opened in a `with` block so the
# handle is closed deterministically instead of being leaked.
for out_path, label_matrix in [
    ('./efs/disease712/trainLabels.dat', train_labels),
    ('./efs/disease712/validLabels.dat', valid_labels),
    ('./efs/disease712/testLabels.dat', test_labels),
]:
    with open(out_path, 'wb') as out_file:
        np.save(out_file, label_matrix, allow_pickle=False)

In [9]:
import numpy as np

# Reload the arrays written by the cells above.
# Paths and the HDF5 dataset name must match what was written: the files live
# under './efs/disease712/' and the dataset was created with the name
# "training" (a key such as './efs/training' does not exist in that file).
with h5py.File('./efs/disease712/training.hdf5', 'r') as hf:
    # [:] reads the whole dataset into memory before the file is closed.
    train_tensors = hf['training'][:]
valid_tensors = np.load('./efs/disease712/validation.dat')
test_tensors = np.load('./efs/disease712/testing.dat')

train_labels = np.load('./efs/disease712/trainLabels.dat')
valid_labels = np.load('./efs/disease712/validLabels.dat')
test_labels = np.load('./efs/disease712/testLabels.dat')

In [1]:
!pip install tqdm
!pip install keras
!pip install tensorflow

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
