In [31]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.utils import resample
from sklearn.preprocessing import MultiLabelBinarizer
import h5py

# Load the per-image metadata table and the list of image files to exclude.
df = pd.read_csv('Data_Entry_2017.csv')
bl = pd.read_csv('blacklist.csv')
bl_list = bl['black'].tolist()

# Remove rows with unreasonable ages: the 16 rows with the largest
# 'Patient Age' are dropped.
# NOTE(review): the count 16 is hard-coded; confirm it still matches the CSV.
df = df.drop(df.sort_values(by='Patient Age', ascending=False).head(16).index)
# Rescale age so that the largest remaining age maps to 1.
df['Patient Age'] = df['Patient Age'] / df['Patient Age'].max()

# Each split file is read as a whitespace-separated list of image file names.
with open('test_list.txt', 'r') as f1:
    x = f1.read().split()          # image names assigned to the test split
with open('train_val_list.txt', 'r') as f2:
    y = f2.read().split()          # image names assigned to train + validation

# Computed once and reused for both splits.
not_blacklisted = ~df['Image Index'].isin(bl_list)

# Training rows are left unshuffled here; the balancing cell shuffles them.
train = df.loc[df['Image Index'].isin(y) & not_blacklisted]

# Seeded so the test ordering (and therefore test_files_list and any labels
# derived from `test`) is identical on every run.
test = df.loc[df['Image Index'].isin(x) & not_blacklisted]
test = shuffle(test, random_state=123)
test_files_list = test['Image Index'].tolist()

In [32]:
# Sanity check: (rows, columns) of the training split after blacklisted images were removed.
train.shape

(85678, 12)

In [33]:
from sklearn.utils import resample
# The 15 label names in alphabetical order.
labels_list = ['Atelectasis','Cardiomegaly','Consolidation','Edema','Effusion',
 'Emphysema','Fibrosis','Hernia','Infiltration','Mass','No Finding',
 'Nodule','Pleural_Thickening','Pneumonia','Pneumothorax']

# (label, number of rows to draw with replacement) for every finding that is
# up-sampled. A list of pairs keeps the order explicit on any Python version.
upsample_targets = [
    ('Cardiomegaly', 3500),
    ('Consolidation', 3500),
    ('Edema', 3500),
    ('Emphysema', 3500),
    ('Fibrosis', 3500),
    ('Hernia', 2500),
    ('Pleural_Thickening', 3500),
    ('Pneumonia', 3500),
]
no_finding_samples = 35000
seed = 123

finding_labels = train['Finding Labels']

# Up-sample Minority Class
# A row carrying several of these findings is eligible for each of them.
upsampled = []
# Tracks every row already covered by an up-sampled label or by 'No Finding'.
covered = finding_labels.str.contains('No Finding', regex=False)
for label, n_samples in upsample_targets:
    has_label = finding_labels.str.contains(label, regex=False)
    upsampled.append(
        resample(train[has_label], replace=True, n_samples=n_samples, random_state=seed)
    )
    covered = covered | has_label

# Down-sample Majority Class
# Only rows whose label string is exactly 'No Finding' are candidates.
NF = train.loc[finding_labels == 'No Finding']
df_NF = resample(NF, replace=False, n_samples=no_finding_samples, random_state=seed)

# Everything else (rows with none of the labels above) is kept as-is.
rest = train[~covered]

# Combine up-sampled minority classes, down-sampled majority class and the rest
df_balance = pd.concat(upsampled + [df_NF, rest])

# NOTE(review): up-sampling with replacement happens before the later
# [:76000] / [76000:] train/validation split of this shuffled frame, so copies
# of one image can end up on both sides of that split.

# Shuffle (seeded for reproducibility) and record the resulting file order
df_balance = shuffle(df_balance, random_state=seed)
train_files_list = df_balance['Image Index'].tolist()
df_balance.shape

(86930, 12)

In [34]:
# (rows, columns) of the training rows that were neither up-sampled nor 'No Finding'.
rest.shape

(24930, 12)

In [35]:
# Turn one 'Finding Labels' string into its individual label names
def split_labels(label):
    """Split a '|'-separated label string and return the pieces as a list."""
    individual_labels = label.split('|')
    return individual_labels

# Per-row lists of label names: `a` for the balanced training frame, `b` for test
a = df_balance['Finding Labels'].map(split_labels)
b = test['Finding Labels'].map(split_labels)

In [36]:
# Create MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()

# One-hot encode data.
# The label vocabulary (and therefore the column order) is learned from the
# training labels only; the test labels are then encoded with that same
# fitted mapping. Refitting on the test labels could silently shift columns
# if the two sets did not contain exactly the same classes.
trainL15 = one_hot.fit_transform(a.tolist())
testL15 = one_hot.transform(b.tolist())

# List of labels corresponds to the columns of both matrices above
labels_list = one_hot.classes_
print(labels_list)

['Atelectasis' 'Cardiomegaly' 'Consolidation' 'Edema' 'Effusion'
 'Emphysema' 'Fibrosis' 'Hernia' 'Infiltration' 'Mass' 'No Finding'
 'Nodule' 'Pleural_Thickening' 'Pneumonia' 'Pneumothorax']


In [37]:
def to_14_labels(labels15, no_finding_idx):
    """Drop the 'No Finding' column from a one-hot label matrix.

    Parameters
    ----------
    labels15 : 2-D array-like, one row per image, one column per label.
    no_finding_idx : int, column index of the 'No Finding' label.

    Returns
    -------
    numpy.ndarray with the same number of rows and one column fewer. Rows
    flagged as 'No Finding' are all zeros; every other row keeps its
    remaining columns unchanged.
    """
    labels15 = np.asarray(labels15)
    # np.delete returns a new array, so the in-place zeroing below never
    # touches the caller's matrix.
    labels14 = np.delete(labels15, no_finding_idx, axis=1)
    labels14[labels15[:, no_finding_idx] == 1] = 0
    return labels14

# Locate the column by name instead of hard-coding its position.
no_finding_idx = list(labels_list).index('No Finding')

# reshape train label to 14 labelling style (vectorised: stacking row by row
# re-copies the whole growing array on every iteration)
trainL = to_14_labels(trainL15, no_finding_idx)
# testL is read by the label-splitting cell but was never assigned anywhere;
# build it the same way as trainL.
testL = to_14_labels(testL15, no_finding_idx)

In [38]:
# Number of rows in the 14-label training matrix.
len(trainL)

86930

In [39]:
# np.save appends ".npy" to a path that lacks it, so open file handles are
# passed to keep the exact ".dat" names. The `with` blocks guarantee each file
# is flushed and closed.
with open('./efs/crop300balanced/trainL.dat', 'wb') as f:
    np.save(f, trainL, allow_pickle=False)
with open('./efs/crop300balanced/train_files_list.dat', 'wb') as f:
    np.save(f, train_files_list, allow_pickle=False)
#np.save(open('./efs/crop300balanced/test_files_list.dat', 'wb'), test_files_list, allow_pickle=False)

In [40]:
from keras.preprocessing import image                  
from tqdm import tqdm_notebook as tqdm
from keras.preprocessing import image                  
from tqdm import tqdm_notebook as tqdm
import boto3
import tempfile
import matplotlib.image as mpimg
from PIL import ImageFile
from matplotlib.pyplot import imshow

# Shared S3 bucket handle, created on first use so the client is not rebuilt
# for every image.
_S3_BUCKET = None


def _get_bucket():
    """Return the shared handle to the image bucket, creating it on first use."""
    global _S3_BUCKET
    if _S3_BUCKET is None:
        # SECURITY: no credentials in source. boto3 resolves them from its
        # default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
        # variables, the shared credentials file, or an attached IAM role).
        # The key pair that used to be hard-coded here must be rotated.
        s3 = boto3.resource('s3', region_name='us-east-1')
        _S3_BUCKET = s3.Bucket('nih-chest-xrays-dataset')
    return _S3_BUCKET


# Helper method to convert images to training tensors
def path_to_tensor(img_path, shape, crop_box=(50, 50, 350, 350)):
    """Download one image from S3 and return it as a scaled 4-D tensor.

    Parameters
    ----------
    img_path : str
        Object name below the ``images/`` prefix of the bucket.
    shape : tuple
        ``target_size`` passed to ``image.load_img`` (the image is resized to it).
    crop_box : tuple of 4 ints, optional
        ``(left, upper, right, lower)`` box passed to ``PIL.Image.crop`` after
        resizing. With the default box and ``shape=(400, 400)`` this keeps the
        central 300x300 region.

    Returns
    -------
    numpy.ndarray
        Pixel values divided by 255, with a leading batch axis of length 1.
    """
    # Tolerate source files that are genuinely truncated (process-wide PIL flag).
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    s3_object = _get_bucket().Object('images/' + img_path)

    # The temp file is removed when the block exits, so all decoding happens inside.
    with tempfile.NamedTemporaryFile() as tmp:
        s3_object.download_fileobj(tmp)
        # Push buffered bytes to disk before the file is re-read by name;
        # otherwise the reader can see an incomplete image.
        tmp.flush()
        # loads RGB image as PIL.Image.Image type
        img = image.load_img(tmp.name, target_size=shape)
        cropped_im = img.crop(crop_box)
        x = image.img_to_array(cropped_im) / 255

    # add the batch axis: 3D tensor -> 4D tensor with shape (1, H, W, C)
    return np.expand_dims(x, axis=0)

# Turn a sequence of image names into one stacked tensor
def paths_to_tensor(img_paths, shape):
    """Load every image in ``img_paths`` via ``path_to_tensor`` and stack the results.

    A progress bar is shown while iterating. The per-image tensors are
    stacked along the first axis with ``np.vstack``.
    """
    per_image = []
    for single_path in tqdm(img_paths):
        per_image.append(path_to_tensor(single_path, shape))
    return np.vstack(per_image)

In [44]:
# Build the image tensor for the first 76000 entries of the shuffled file list
img_shape = (400, 400)
train_paths = train_files_list[:76000]
train_tensors = paths_to_tensor(train_paths, shape=img_shape)

# Write the training tensor to HDF5 under the dataset name "training"
with h5py.File('./efs/crop300balanced/training.hdf5', 'w') as hf:
    hf.create_dataset("training", data=train_tensors)

HBox(children=(IntProgress(value=0, max=76000), HTML(value='')))

In [45]:
# Validation images are the entries of the shuffled file list from index 76000 on.
img_shape = (400,400)
valid_tensors = paths_to_tensor(train_files_list[76000:], shape = img_shape)
# A file handle (not a path) keeps the exact ".dat" name, and `with` makes
# sure the file is flushed and closed.
with open('./efs/crop300balanced/validation.dat', 'wb') as f:
    np.save(f, valid_tensors, allow_pickle=False)

HBox(children=(IntProgress(value=0, max=10930), HTML(value='')))

In [41]:
# Split training labels at the same 76000-row boundary used for the image tensors
train_labels = trainL[:76000]
valid_labels = trainL[76000:]
test_labels = testL[:]
# Save training labels.
# File handles keep the exact ".dat" names (np.save would append ".npy" to a
# bare path); `with` guarantees each file is flushed and closed.
with open('./efs/crop300balanced/trainLabels.dat', 'wb') as f:
    np.save(f, train_labels, allow_pickle=False)
with open('./efs/crop300balanced/validLabels.dat', 'wb') as f:
    np.save(f, valid_labels, allow_pickle=False)

In [9]:
import numpy as np

# Reload previously saved tensors, labels and auxiliary data from ./efs/.
# NOTE(review): this cell carries execution count 9, i.e. it was run before
# the cells shown above it. It reads from './efs/' while those cells write to
# './efs/crop300balanced/', and it opens the HDF5 dataset as './efs/training'
# while the writer stores it as "training" — confirm paths and key against the
# files actually on disk.
with h5py.File('./efs/training.hdf5', 'r') as hf:
    # [:] copies the whole dataset into memory before the file is closed
    train_tensors = hf['./efs/training'][:]
valid_tensors = np.load('./efs/validation.dat')
test_tensors = np.load('./efs/testing.dat')

# One-hot label matrices for each split
train_labels = np.load('./efs/trainLabels.dat')
valid_labels = np.load('./efs/validLabels.dat')
test_labels = np.load('./efs/testLabels.dat')

# NOTE(review): no cell visible here writes the *Data.dat files — presumably
# produced elsewhere; verify their origin and contents.
train_data = np.load('./efs/trainData.dat')
valid_data = np.load('./efs/validData.dat')
test_data = np.load('./efs/testData.dat')

In [1]:
# Install third-party packages imported by this notebook. This cell carries
# execution count 1, i.e. it was run before the cells shown above it.
# NOTE(review): versions are unpinned; the captured output below shows
# tqdm 4.24.0 and Keras 2.2.2 being installed — consider pinning them.
!pip install tqdm
!pip install keras
!pip install tensorflow

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/7d/e6/19dfaff08fcbee7f3453e5b537e65a8364f1945f921a36d08be1e2ff3475/tqdm-4.24.0-py2.py3-none-any.whl (43kB)
[K    100% |████████████████████████████████| 51kB 10.8MB/s ta 0:00:01
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.24.0
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting keras
[?25l  Downloading https://files.pythonhosted.org/packages/34/7d/b1dedde8af99bd82f20ed7e9697aac0597de3049b1f786aa2aac3b9bd4da/Keras-2.2.2-py2.py3-none-any.whl (299kB)
[K    100% |████████████████████████████████| 307kB 15.6MB/s ta 0:00:01
Collecting keras-preprocessing==1.0.2 (from keras)
  Downloading https://files.pythonhosted.org/packages/71/26/1e778ebd737032749824d5cba7dbd3b0cf9234b87ab5ec79f5f0403ca7e9/Keras_Preproces