In [1]:
import os
from collections import Counter
from glob import glob
from itertools import chain
from random import shuffle

import cv2
import keras
import matplotlib.pyplot as plt
# matplotlib.use('Agg')
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import applications
from keras import backend as K
from keras import models
from keras.applications import DenseNet121
from keras.callbacks import TensorBoard, ModelCheckpoint,ReduceLROnPlateau
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, Flatten, BatchNormalization, Dropout
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from scipy import ndimage
from sklearn.metrics import accuracy_score, auc, classification_report, roc_curve
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder,MultiLabelBinarizer
from sklearn.utils import class_weight
from tensorflow.python.client import device_lib

from class_dataset import ChestDataset

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Show the devices TensorFlow and the Keras backend can see, so a missing GPU
# is noticed before training starts.
print(device_lib.list_local_devices())
print(K.tensorflow_backend._get_available_gpus())

# Hand ownership of /data to ds:ds. os.system returns the command's exit
# status; report a failure instead of silently ignoring it.
chown_status = os.system('sudo chown -R ds:ds /data')
if chown_status != 0:
    print('Warning: "sudo chown -R ds:ds /data" exited with status {}'.format(chown_status))

# Create the output folder used by every later cell. exist_ok=True replaces
# the separate exists()/mkdir() pair and is safe to re-run.
os.makedirs('output', exist_ok=True)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17107079016664964972
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11287530701
locality {
  bus_id: 1
  links {
  }
}
incarnation: 3264332340083326695
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7"
]
['/job:localhost/replica:0/task:0/device:GPU:0']


In [3]:
# CHOOSE now your model name 
# model_name is used as the stem of every artefact written under output/.
model_name = 'densechest_multiclass_aug'

csvfile = 'data_kaggle/Data_Entry_2017.csv'
df = pd.read_csv(csvfile)

data_dir = '/data/xray_chest_final/'

# NOTE(review): reset_folder() is defined in class_dataset; presumably it
# restores the on-disk layout under data_dir before the split -- confirm.
ChestDataset(data_dir,df).reset_folder()

# `reader` is the dataset's DataFrame; it carries an `exists` column.
df = ChestDataset(data_dir,df).reader
# Use item access for the column so it cannot be confused with an attribute.
df = df[df['exists'] == True]
# Keep only rows whose label string has no '|' separator. The '|' is matched
# literally (regex=False) instead of through the invalid '\|' escape sequence.
df = df[~df['Finding Labels'].str.contains('|', regex=False)]

In [4]:
# Keep only the labels that occur more than 500 times.
count = df['Finding Labels'].value_counts() 
labels_to_keep = count[count > 500].index.values
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous DataFrame (SettingWithCopyWarning).
df = df[df['Finding Labels'].isin(labels_to_keep)].copy()

# MultiLabelBinarizer expects an iterable of labels per row, hence the
# one-element list wrapping each label string.
df['labels'] = df['Finding Labels'].apply(lambda x: [x])
binarizer = MultiLabelBinarizer()
binarizer.fit(df.labels)
df['target'] = list(binarizer.transform(df.labels))

# Drop every third 'No Finding' row (positions 0, 3, 6, ... among those rows).
no_finding_index = df[df['Finding Labels'] == 'No Finding'].index.values
row_to_drop = list(no_finding_index[::3])
df = df.drop(row_to_drop)

In [5]:
dataset = ChestDataset(data_dir,df)

# Strip the data_dir prefix from every image path, then send every fifth
# path (positions 0, 5, 10, ...) to the test list and the rest to train.
prefix_length = len(data_dir)
relative_paths = [full_path[prefix_length:] for full_path in dataset.image_path]
test_list = relative_paths[::5]
train_list = [rel_path for position, rel_path in enumerate(relative_paths) if position % 5 != 0]

In [6]:
# Write both split lists to disk, one relative path per line, so the exact
# split used for this model can be recovered later.
split_listings = (
    ('output/{}_train_list.txt'.format(model_name), train_list),
    ('output/{}_test_list.txt'.format(model_name), test_list),
)
for listing_path, entries in split_listings:
    with open(listing_path, 'w') as listing_file:
        listing_file.writelines("%s\n" % entry for entry in entries)

# Build the two split datasets and ask each to create its directory tree.
train_dt,test_dt = dataset.train_test(train_list,test_list)
for split_dataset in (train_dt, test_dt):
    split_dataset.create_tree()

train_files, test_files = train_dt.image_path, test_dt.image_path
train_folder, test_folder = train_dt.dir, test_dt.dir

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.reader['exists'] = self.exists


In [7]:
# Print the fraction of samples carrying each retained label, for the train
# split and then the test split, to check the two are balanced alike.
train_total = len(train_dt)
test_total = len(test_dt)
for label in labels_to_keep:
    train_share = train_dt.labels.count(label) / train_total
    print(label + ' in Train: ', train_share)
    test_share = test_dt.labels.count(label) / test_total
    print(label + ' in Test: ', test_share)


No Finding in Train:  0.6742581090407177
No Finding in Test:  0.6859858938975775
Infiltration in Train:  0.12253661529023847
Infiltration in Test:  0.12634161300214658
Atelectasis in Train:  0.07000996856069319
Atelectasis in Test:  0.05519779208831647
Effusion in Train:  0.05666743347902768
Effusion in Test:  0.060410916896657466
Nodule in Train:  0.04455179817498658
Nodule in Test:  0.03925176326280282
Pneumothorax in Train:  0.031976075454336325
Pneumothorax in Test:  0.032812020852499235


In [8]:
# ADD YOUR MODEL
img_width,img_height = 256,256
# Channels-last Keras models take input_shape as (height, width, channels).
densenet = DenseNet121(weights='imagenet', include_top=False,input_shape = (img_height, img_width, 3))

# # Freeze some layers
# for layer in densenet.layers[:100]:
#     layer.trainable = False

# One softmax unit per class the binarizer was fitted on, so the head follows
# the label filtering instead of a hard-coded 6.
n_classes = len(binarizer.classes_)

# Create the model: DenseNet121 feature extractor followed by a small head.
model = models.Sequential()

model.add(densenet)

# Add new layers
model.add(Flatten())
# model.add(Dense(72))
# model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax'))

# Show a summary of the model. Check the number of trainable parameters
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
densenet121 (Model)          (None, 8, 8, 1024)        7037504   
_________________________________________________________________
flatten_1 (Flatten)          (None, 65536)             0         
_________________________________________________________________
activation_1 (Activation)    (None, 65536)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 65536)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 393222    
Total params: 7,430,726
Trainable params: 7,347,078
Non-trainable params: 83,648
_________________________________________________________________


In [17]:
def rotate(img,deg):
    """Return `img` rotated by `deg` degrees.

    Thin wrapper around `scipy.ndimage.rotate`, called with only the image
    and the angle (all other options are left at the library defaults).
    """
    rotated = ndimage.rotate(img, deg)
    return rotated

def stdize(img):
    """Standardise every channel of `img` in place and return the same array.

    For channel index c along axis 2, the channel is replaced by
    (channel - mean[c]) / std[c] using the three hard-coded constants below.
    NOTE(review): the constants look like the usual ImageNet RGB statistics
    -- confirm the intended channel order against the image loader.
    """
    channel_means = np.array([0.485, 0.456, 0.406])
    channel_stds = np.array([0.229, 0.224, 0.225])
    for channel in range(img.shape[2]):
        centered = img[:, :, channel] - channel_means[channel]
        # Written back into the caller's array: the input is modified.
        img[:, :, channel] = centered / channel_stds[channel]
    return img

def get_input(path, size=(256, 256)):
    """Load the image at `path`, resize it and scale pixel values by 1/255.

    Args:
        path: Path of the image file to read with `cv2.imread`.
        size: Target size passed to `cv2.resize`; defaults to the (256, 256)
            used so far.

    Returns:
        The resized image divided by 255.0.

    Raises:
        IOError: If OpenCV cannot read the file. `cv2.imread` returns None in
            that case instead of raising, which otherwise surfaces as an
            opaque error inside `cv2.resize`.
    """
    img = cv2.imread(path)
    if img is None:
        raise IOError('Could not read image: {}'.format(path))
    img = cv2.resize(img, size)
    img = img/255.0
    # Per-channel standardisation via stdize(img) is currently not applied.
    return img

def get_label(path):
    """Return the second-to-last '/'-separated component of `path`.

    For a path shaped like '<root>/<label>/<file>' this is the name of the
    folder that directly contains the file.
    """
    # Only the last two separators matter, so split from the right.
    pieces = path.rsplit('/', 2)
    return pieces[-2]

def preprocess(img):
    """Return `img` plus a random subset of its rotated and flipped copies.

    The returned list always starts with `img` itself. Each of the five
    candidate transforms (rotation by 90, 180 and -90 degrees, vertical flip,
    horizontal flip) is then added independently with probability 0.5.

    The coin flip uses a uniform draw from [0, 1). The previous
    `np.random.normal() > 0.5` compared a standard normal sample with 0.5,
    which is true only about 31% of the time.
    """
    transforms = (
        lambda image: rotate(image, 90),
        lambda image: rotate(image, 180),
        lambda image: rotate(image, -90),
        np.flipud,
        np.fliplr,
    )

    batch = [img]
    for transform in transforms:
        if np.random.rand() > 0.5:
            batch.append(transform(img))
    return batch


def ChestGen(files, batch_size = 16,augment = False):
    """Endlessly yield (images, one-hot labels) batches built from `files`.

    Args:
        files: Sequence of image paths; the label of each image is taken from
            its path via `get_label`.
        batch_size: Number of paths consumed per yielded batch. The last
            batch of a pass may be smaller, and augmentation can make a batch
            larger than this.
        augment: When true, every image whose label is not 'No Finding' also
            contributes the output of `preprocess`.

    Yields:
        Tuple (batch_x, batch_y) of numpy arrays.

    Raises:
        ValueError: If `files` is empty.

    Once every path has been consumed the generator reshuffles and starts a
    new pass. Previously `idx` kept growing, so every batch after the first
    pass was empty.
    """
    # Work on a private copy so the caller's list is not reordered.
    files = list(files)
    if not files:
        raise ValueError('ChestGen received an empty list of files')

    shuffle(files)
    idx = 0
    while True:
        # Slicing past the end simply returns the remaining paths.
        batch_paths = files[idx:idx + batch_size]
        batch_input = []
        batch_output = [] 

        # Read in each input, perform preprocessing and get labels
        for input_path in batch_paths:
            x = get_input(input_path)
            y = get_label(input_path)
            vec = binarizer.transform([(y,)])
            batch_input += [x]
            batch_output += list(vec)

            if y != 'No Finding' and augment:
                # NOTE(review): preprocess() returns the untouched image as
                # its first element, so x is added a second time here --
                # confirm this oversampling is intended.
                batch_prep = preprocess(x)
                batch_input += batch_prep
                batch_output += list(vec)*len(batch_prep)
        # Return a tuple of (input,output) to feed the network
        batch_x = np.array(batch_input)
        batch_y = np.array(batch_output)

        idx += batch_size
        if idx >= len(files):
            # End of a pass: start over with a fresh order.
            idx = 0
            shuffle(files)

        yield(batch_x, batch_y)

In [36]:
# serialize model to JSON
model_json = model.to_json()
with open("output/{}.json".format(model_name), "w") as json_file:
    json_file.write(model_json)


train_batchsize = 13
val_batchsize = 13

# glob() is called without recursive=True, so '**' behaves like '*': the
# patterns match entries exactly two levels below train/ and test/.
train_pattern = data_dir +'train/**/*'
test_pattern = data_dir + 'test/**/*'
train_path = glob(train_pattern)
test_path = glob(test_pattern)

# Fail here with a clear message rather than deep inside fit_generator when
# the folders are missing or empty.
assert train_path, 'No training files matched {}'.format(train_pattern)
assert test_path, 'No test files matched {}'.format(test_pattern)

# Only the training stream is augmented.
train_generator = ChestGen(train_path,train_batchsize,augment=True)
test_generator = ChestGen(test_path,val_batchsize,augment=False)

In [18]:
# Compile the model
optimizer = Adam(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])

tensorboard = TensorBoard(log_dir='output/logs', histogram_freq=0,
                          write_graph=True, write_images=False)
filepath = "output/checkpoint_{}.hdf5".format(model_name)
# The monitored key must match the metric name given to compile(): with
# metrics=['categorical_accuracy'] the validation value is logged as
# 'val_categorical_accuracy'. Monitoring 'val_acc' never finds a value, so
# no checkpoint would be written.
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1,
                             save_best_only=True, mode='max')
# Halve the learning rate after 3 epochs without improvement in val_loss.
lr_sc = ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=3,verbose=1)


# Train the model
history = model.fit_generator(
    train_generator,
    steps_per_epoch=len(train_path) // train_batchsize,
    epochs=50,
    validation_data=test_generator,
    validation_steps=len(test_path) // val_batchsize,
    verbose=1,
    callbacks=[tensorboard,checkpoint,lr_sc])

Epoch 1/50


StopIteration: 

In [None]:
#metrics
# Keras stores each metric under the name it was compiled with. The model is
# compiled with 'categorical_accuracy', so look that key up first and fall
# back to the generic names, rather than assuming 'acc' (a KeyError here).
acc_key = next(key for key in ('categorical_accuracy', 'acc', 'accuracy')
               if key in history.history)
val_acc_key = 'val_' + acc_key

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

ax_acc.plot(history.history[acc_key])
ax_acc.plot(history.history[val_acc_key])
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['train', 'test'], loc='upper left')

ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['train', 'test'], loc='upper left')
# plt.show()
fig.savefig('output/history_{}.png'.format(model_name))

In [None]:
# Store the trained weights in HDF5 format next to the other artefacts of
# this model under output/.
weights_path = "output/{}.h5".format(model_name)
model.save_weights(weights_path)
print("Saved model to disk")

In [None]:
# `validation_generator` is used by this and the following evaluation cells,
# but no earlier cell creates it. Those cells read `.classes`,
# `.class_indices` and `.num_classes`, which a directory iterator provides,
# so build one over the test folder. shuffle=False keeps `.classes` aligned
# with the order of the predictions.
# NOTE(review): assumes data_dir + 'test' holds one sub-folder per label (as
# the glob pattern and get_label() imply) and that the alphabetical class
# indices match the binarizer's column order -- confirm.
# NOTE(review): this loader reads images as RGB while training used
# cv2.imread (BGR); the two only agree for grayscale images -- confirm.
validation_generator = ImageDataGenerator(rescale=1.0 / 255).flow_from_directory(
    data_dir + 'test',
    target_size=(img_height, img_width),
    batch_size=val_batchsize,
    class_mode='categorical',
    shuffle=False)

# `use_multiprocessing` is the current name of the deprecated `pickle_safe`.
prediction = model.predict_generator(validation_generator,
                                     steps=len(validation_generator),
                                     use_multiprocessing=True,
                                     verbose=1)

In [None]:
# Predicted class index = column with the highest softmax score.
preds = np.argmax(prediction,axis=1)

# One-hot matrix of the true classes (used later for the ROC curves).
y_true = np.zeros((preds.shape[0],validation_generator.num_classes))
y_true[np.arange(preds.shape[0]), validation_generator.classes] = 1
# Map class index -> class name.
inv_map = {v:k for k,v in validation_generator.class_indices.items()}
pred_cat = [inv_map[i] for i in preds]

# classification_report and accuracy_score come from sklearn.metrics. Passing
# labels/target_names makes the report list every class by name, not index.
class_ids = sorted(inv_map)
report = classification_report(validation_generator.classes, preds,
                               labels=class_ids,
                               target_names=[inv_map[i] for i in class_ids])
np.save('output/report_{}.npy'.format(model_name),report)
print(report)
print('Accuracy score: ',accuracy_score(validation_generator.classes,preds))

In [None]:
# Evaluate with the metrics the model was compiled with; score[0] is the
# loss and score[1] the first compiled metric.
# `use_multiprocessing` is the current name of the deprecated `pickle_safe`.
score = model.evaluate_generator(validation_generator,
                                 steps=len(validation_generator),
                                 use_multiprocessing=True)
print('Accuracy Keras: ', score[1])

In [None]:
# Auc scores
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(validation_generator.num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], prediction[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig = plt.figure(figsize=(15,10))
for i in range(validation_generator.num_classes):
    plt.plot(fpr[i], tpr[i],
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(inv_map[i], roc_auc[i]))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
# plt.show()
fig.savefig('output/roc_curve_{}.png'.format(model_name))

print('End Of Training')