In [1]:
import numpy as np
import pandas as pd
from glob import glob
from sklearn.datasets import load_files
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import os
import shutil

from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16, DenseNet121, InceptionV3
from keras.utils import np_utils
from keras.applications.inception_v3 import InceptionV3
from keras.callbacks import ModelCheckpoint, LearningRateScheduler

from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, BatchNormalization
from keras.layers import Dropout, Flatten, Dense, Activation
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from tensorflow.python.client import device_lib

Using TensorFlow backend.


In [None]:
# Show every device TensorFlow lists locally, then what the Keras backend
# helper returns for available GPUs.
local_devices = device_lib.list_local_devices()
print(local_devices)

available_gpus = K.tensorflow_backend._get_available_gpus()
print(available_gpus)

In [None]:
# Recursively give user and group `ds` ownership of /data (runs through sudo).
# NOTE(review): presumably needed so later cells can create folders under /data — confirm.
!sudo chown -R ds:ds /data

In [2]:
# Input locations used by the rest of the notebook.
DF_DIR = "../data/Data_Entry_2017.csv"  # metadata CSV passed to pd.read_csv (a file path, despite the name)
IMG_DIR = "/data/xray_chest_final"  # image folder; the train/ and test/ sub-folders are created inside it
TRAIN_LIST = "../data/train_val_list.txt"  # text file read line by line into train_list
TEST_LIST = "../data/test_list.txt"  # text file read line by line into test_list

In [3]:
# Load the metadata table and keep only its first 11 columns.
df_data_entry = pd.read_csv(DF_DIR)
# .copy() makes `df` an independent frame: a later cell assigns a new
# 'labels_count' column to it, which on a bare iloc slice raises
# SettingWithCopyWarning.
df = df_data_entry.iloc[:, :11].copy()
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143


In [None]:
# Number of findings per row: '|'-separated entries, with 'No Finding' counted as 0
def _count_findings(text):
    """Return how many '|'-separated findings `text` lists (0 for 'No Finding')."""
    if text == 'No Finding':
        return 0
    return len(text.split('|'))

df['labels_count'] = df['Finding Labels'].apply(_count_findings)
df.head()

In [None]:
# Read the entries of each split file, one whitespace-stripped line per entry.
with open(TRAIN_LIST, 'r') as train_items:
    train_list = [line.strip() for line in train_items]

with open(TEST_LIST, 'r') as test_items:
    test_list = [line.strip() for line in test_items]

In [None]:
def train_test_split(train_test):
    """Ensure the folder ``IMG_DIR/<train_test>`` exists.

    Args:
        train_test: Name of the sub-folder to create inside ``IMG_DIR``
            (the notebook calls this with 'train' and 'test').

    Calling it again for a folder that already exists is a no-op, and
    missing parent folders are created as well.
    """
    # exist_ok=True replaces the check-then-mkdir pair, so the folder cannot
    # appear between the existence check and the creation.
    os.makedirs(os.path.join(IMG_DIR, train_test), exist_ok=True)

def split_items(train_val_test_list, category):
    """Copy single-finding images into ``IMG_DIR/<category>/<finding>/``.

    For every file name in ``train_val_test_list`` the metadata frame ``df``
    is consulted (columns 'Image Index', 'Finding Labels', 'labels_count').
    Only images whose ``labels_count`` is exactly 1 are copied; the target
    sub-folder is named after the image's 'Finding Labels' value.

    Args:
        train_val_test_list: Iterable of image file names located directly
            in ``IMG_DIR``.
        category: Split folder under ``IMG_DIR`` to copy into
            (the notebook uses 'train' and 'test').

    Failures are reported per item and do not stop the run: names missing
    from ``df`` and files that cannot be copied are printed and skipped.
    A ``KeyError`` propagates if ``df`` lacks one of the required columns.
    """
    dest = os.path.join(IMG_DIR, category)

    # Build the name -> finding lookup once instead of scanning the whole
    # frame twice per image. Keeping the first row per image mirrors what
    # `.values[0]` would select.
    first_rows = df.drop_duplicates(subset='Image Index', keep='first')
    known_images = set(first_rows['Image Index'])
    single = first_rows[first_rows['labels_count'] == 1]  # keep only 1 disease
    finding_by_image = dict(zip(single['Image Index'], single['Finding Labels']))

    for item in train_val_test_list:
        folder = finding_by_image.get(item)
        if folder is None:
            # Either the image has zero/several findings (skipped silently)
            # or it is absent from the metadata (worth reporting).
            if item not in known_images:
                print('{}: not found in the metadata table, skipped'.format(item))
            continue

        target_dir = os.path.join(dest, folder)
        try:
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(os.path.join(IMG_DIR, item), target_dir)
        except OSError as e:
            # Report and move on: one unreadable file must not abort the
            # copy of every remaining image.
            print('{}: {}'.format(item, e))

In [None]:
# Create both split folders first, then fill each one with its images.
for split_name in ('train', 'test'):
    train_test_split(split_name)

for split_entries, split_name in ((train_list, 'train'), (test_list, 'test')):
    split_items(split_entries, split_name)

In [None]:
# `dest` only exists as a local inside the helper functions, so build the
# pattern from IMG_DIR instead of relying on a leftover kernel variable.
print(glob(os.path.join(IMG_DIR, 'train', '*')))

# Model

In [None]:
# define function to load train, test, and validation datasets
def load_dataset(path, n_classes):
    """Return the file paths under ``path`` and their one-hot encoded labels.

    Args:
        path: Folder handed to ``sklearn.datasets.load_files``.
        n_classes: Number of columns of the one-hot encoding.

    Returns:
        Tuple ``(chest_files, chest_targets)``: an array with the listed
        file names and the matching ``to_categorical`` label matrix.
    """
    # Only 'filenames' and 'target' are used below, so do not read the
    # content of every file into memory.
    data = load_files(path, load_content=False)
    chest_files = np.array(data['filenames'])
    chest_targets = np_utils.to_categorical(np.array(data['target']), n_classes)
    return chest_files, chest_targets

# Class labels are the names of the sub-folders of the training directory.
# Paths are built from IMG_DIR (there is no notebook-level `dest`), and the
# folder name is taken with basename/dirname rather than a fixed character
# offset that only fits one particular IMG_DIR length.
train_dir = os.path.join(IMG_DIR, 'train')
test_dir = os.path.join(IMG_DIR, 'test')
labels = [os.path.basename(os.path.dirname(item))
          for item in sorted(glob(os.path.join(train_dir, '*', '')))]
n_classes = len(labels)

# load train and test datasets
train_files, train_targets = load_dataset(train_dir, n_classes)
test_files, test_targets = load_dataset(test_dir, n_classes)

# Img size
img_width, img_height, channels = 224, 224, 3

# Share of each class per split: non-zero entries per one-hot column
# divided by the number of rows.
train_prop = np.count_nonzero(train_targets, axis=0) / len(train_targets)
test_prop = np.count_nonzero(test_targets, axis=0) / len(test_targets)

print('Proportions: \n')
for index, label in enumerate(labels):
    print('{} train: {:.4f}'.format(label, train_prop[index]*100))
    print('{} test: {:.4f}'.format(label, test_prop[index]*100))
    print('*********************')

print('\nStatistics about the Dataset:\n')
print('There are %d total chest deseases.' % len(labels))
print('There are %s total chest images.\n' % len(np.hstack([train_files, test_files])))
print('There are %d training chest images.' % len(train_files))
print('There are %d test chest images.'% len(test_files))

In [None]:
# Convolutional base: InceptionV3 with ImageNet weights and without its classifier top.
model_cnn = InceptionV3(weights='imagenet', include_top=False, input_shape=(img_height, img_width, channels))

# Classification head stacked on the base output:
# Flatten -> Dense(512) -> BatchNorm -> ReLU -> Dropout -> Dense(n_classes) -> BatchNorm -> softmax
model_top = Sequential()

model_top.add(Flatten(input_shape=model_cnn.output_shape[1:]))
model_top.add(Dense(512))
model_top.add(BatchNormalization())
model_top.add(Activation('relu'))
model_top.add(Dropout(0.25))

model_top.add(Dense(n_classes))
model_top.add(BatchNormalization())
model_top.add(Activation('softmax'))


# Full network: base input through the base and then the head.
model = Model(inputs=model_cnn.input, outputs=model_top(model_cnn.output))
adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [None]:
# Training hyper-parameters
batch_size, epochs = 16, 50

# Split folders read by the generators
train_data_dir, validation_data_dir = (
    os.path.join(IMG_DIR, split_name) for split_name in ('train', 'test'))

# Despite the "samples" in the names, these are counts of whole batches
# (floor division drops a final partial batch).
nb_train_samples, nb_validation_samples = (
    len(split_files) // batch_size for split_files in (train_files, test_files))

print('Layers: {}'.format(len(model.layers)))

In [None]:
# prepare data augmentation configuration
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True)

# Validation images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale=1. / 255)

# ------------------------------------------------------
# Generators
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical')

# flow_from_directory shuffles by default. Validation data gains nothing from
# shuffling, and a fixed order is what lets predict_generator output be
# compared with per-file labels afterwards.
validation_generator = test_datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False)

In [None]:
# Freeze Layers
# NOTE(review): 310 presumably covers the InceptionV3 base so that only the
# new head trains — confirm against the 'Layers: N' value printed earlier.
for layer in model.layers[:310]:
    layer.trainable = False

# Keras only picks up changes to `trainable` when the model is compiled, and
# the model was compiled before this loop ran, so compile it again with the
# same settings; otherwise the freeze has no effect on training.
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

def lr_schedule(epoch):
    """Return the learning rate for `epoch`.

    0.001 up to epoch 10, 0.0005 for epochs 11-30, 0.0003 afterwards.
    """
    lrate = 0.001
    if epoch > 10:
        lrate = 0.0005
    if epoch > 30:
        lrate = 0.0003
    return lrate

# Fit the model
# nb_train_samples / nb_validation_samples are already numbers of batches
# (len(files) // batch_size), so they are passed as Keras 2 step counts.
# The Keras 1 keyword `samples_per_epoch` expects a number of samples instead.
History = model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=nb_validation_samples,
    callbacks=[LearningRateScheduler(lr_schedule)])

In [None]:
plt.figure(1, figsize=(10,10))

# One panel per metric: accuracy on top (211), loss below (212).
# Each tuple: subplot position, train history key, validation history key, title, y-axis label.
panels = (
    (211, 'acc', 'val_acc', 'Model Accuracy', 'Accuracy'),
    (212, 'loss', 'val_loss', 'Model Loss', 'Loss'),
)
for position, train_key, val_key, panel_title, y_label in panels:
    plt.subplot(position)
    plt.plot(History.history[train_key])
    plt.plot(History.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Predictions (1)
# Reset the generator first so that it starts again from its first batch;
# otherwise the outputs come back in an unexpected order.
validation_generator.reset()

# nb_validation_samples is a floor-divided batch count, so a final partial
# batch is not predicted.
class_probabilities = model.predict_generator(validation_generator, steps=nb_validation_samples)

# Index of the class with the largest predicted probability for each image
predictions1 = np.argmax(class_probabilities, axis=1)

print(predictions1)

In [None]:
# Predictions (3)
# `steps` is a number of batches, so pass the validation batch count rather
# than the batch size (which would evaluate only `batch_size` batches).
eval_loss, eval_accuracy = model.evaluate_generator(
    validation_generator, steps=nb_validation_samples, verbose=1)

print(eval_loss, eval_accuracy)

In [None]:
# True class indices in the order the validation generator reads its files.
# test_targets comes from sklearn's load_files, which shuffles its listing by
# default, so labels derived from it do not follow the generator order that
# the predictions follow.
if getattr(validation_generator, 'shuffle', False):
    # A shuffling generator yields files in random order, so its predictions
    # cannot be matched to any fixed label array.
    print('WARNING: validation_generator shuffles its batches; '
          'predictions will not line up with y_true.')
y_true = validation_generator.classes

In [None]:
# Sanity check: compare the number of true labels with the number of predictions.
y_true.shape, predictions1.shape

In [None]:
# Fewer predictions than labels can exist (prediction runs over whole batches
# only), so trim the labels to the number of predictions actually produced
# instead of a hard-coded count.
print(classification_report(y_true[:len(predictions1)], predictions1, target_names=labels))