In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras import applications
from keras import models
from keras.applications import DenseNet121
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, Flatten, BatchNormalization, Dropout
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import accuracy_score, auc, classification_report, roc_curve

from class_dataset import ChestDataset

Using TensorFlow backend.


In [None]:
# Recursively change owner and group of everything under /data to `ds`.
# NOTE(review): needs sudo rights; presumably `ds` is the user running this
# notebook and the change is required so later cells can write below /data
# -- confirm before re-running on another machine.
!sudo chown -R ds:ds /data

In [2]:
# Build a class-balanced subset of the metadata CSV, split it 2:1 into
# train/test, write both file lists to disk and create the split datasets
# that the following cells read.
csvfile = 'data_kaggle/Data_Entry_2017.csv'
df = pd.read_csv(csvfile)

data_dir = '/data/xray_chest_final/'

SEED = 42               # fixed so the per-class sample is identical on every run
SAMPLES_PER_CLASS = 67  # rows drawn from each 'Finding Labels' group
# Number of leading characters cut from every image path before it is stored
# in the train/test lists.
# NOTE(review): 16 is carried over unchanged; which prefix it removes depends
# on the format of ChestDataset.image_path -- confirm.
PATH_PREFIX_LEN = 16

ChestDataset(data_dir, df).reset_folder()

# Drop every row whose label contains a pipe (presumably the separator of
# multi-finding rows). regex=False matches the character literally; the former
# '\|' pattern relied on an invalid string escape.
has_pipe = df['Finding Labels'].str.contains('|', regex=False)
df_uni = ChestDataset(data_dir, df[~has_pipe]).reader
df_uni = df_uni[df_uni.exists == True]

# Same number of rows from every label; the lambda argument no longer shadows
# the global `df`.
df_rd = df_uni.groupby('Finding Labels', group_keys=False).apply(
    lambda group: group.sample(SAMPLES_PER_CLASS, random_state=SEED))

dataset = ChestDataset(data_dir, df_rd)

# Positions 0, 3, 6, ... go to the test list, everything else to train.
train_list = [el[PATH_PREFIX_LEN:] for i, el in enumerate(dataset.image_path) if i % 3 != 0]
test_list = [el[PATH_PREFIX_LEN:] for i, el in enumerate(dataset.image_path) if i % 3 == 0]

with open('dense_train_list.txt', 'w') as f:
    f.writelines("%s\n" % item for item in train_list)

with open('dense_test_list.txt', 'w') as f:
    f.writelines("%s\n" % item for item in test_list)

# These three calls were commented out, yet every later cell reads train_dt and
# test_dt, so a fresh "Restart & Run All" failed with NameError.
# NOTE(review): presumably create_tree() lays out the per-class folders that
# flow_from_directory consumes -- confirm against class_dataset.
train_dt, test_dt = dataset.train_test(train_list, test_list)
train_dt.create_tree()
test_dt.create_tree()

In [7]:
# Short aliases for the image paths and the directory of each split; the
# folders are what the Keras generators are pointed at further down.
train_files, test_files = train_dt.image_path, test_dt.image_path
train_folder, test_folder = train_dt.dir, test_dt.dir

In [8]:
# Labels of the entries whose `exists` flag is set, per split, followed by the
# fraction of them that carry the 'No Finding' label.
label_train = [lab for lab, found in zip(train_dt.labels, train_dt.exists) if found == True]
label_test = [lab for lab, found in zip(test_dt.labels, test_dt.exists) if found == True]

train_no_finding = label_train.count('No Finding') / len(label_train)
test_no_finding = label_test.count('No Finding') / len(label_test)
print('Train # No Finding:', train_no_finding)
print('Test # No Finding:', test_no_finding)

Train # No Finding: 0.7090673094752457
Test # No Finding: 0.5034446210916799


In [9]:
# Print a short summary of the dataset: number of distinct labels, number of
# existing images overall and per split, and the share of each label.
labels = set(dataset.labels)
print('Statistics about the Dataset:\n')
print('There are %d total chest diseases.' % len(labels))
print('There are %d total chest images.\n' % np.sum(dataset.exists))
print('There are %d training chest images.' % np.sum(train_dt.exists))
print('There are %d test chest images.' % np.sum(test_dt.exists))
# Sorted so the listing order is the same on every run (set order is not).
for lab in sorted(labels):
    share = 100 * dataset.labels.count(lab) / len(dataset.labels)
    print('# of %s: %.3f%%' % (lab, share))

Statistics about the Dataset:

There are 15 total chest deseases.
There are 12673 total chest images.

There are 10786 training chest images.
There are 1887 test chest images.
# of Infiltration: 8.672%
# of Mass: 1.941%
# of Effusion: 3.803%
# of Hernia: 0.221%
# of Fibrosis: 1.452%
# of Emphysema: 0.955%
# of Pneumothorax: 1.712%
# of Consolidation: 1.499%
# of Pneumonia: 0.308%
# of Atelectasis: 4.569%
# of Pleural_Thickening: 1.278%
# of Edema: 0.529%
# of No Finding: 67.845%
# of Nodule: 3.133%
# of Cardiomegaly: 2.083%


In [10]:
# Classifier: DenseNet121 convolutional base with ImageNet weights, followed
# by a flatten layer and a softmax output layer.
img_width, img_height = 365, 365
NUM_CLASSES = 15     # width of the softmax output layer
FREEZE_BASE = False  # set True to train only the new head

# Keras image tensors are (height, width, channels); the previous
# (width, height, 3) order only worked because the image is square.
densenet = DenseNet121(weights='imagenet', include_top=False,
                       input_shape=(img_height, img_width, 3))

# Optionally keep the pretrained weights fixed.
if FREEZE_BASE:
    for layer in densenet.layers:
        layer.trainable = False

# Create the model
model = models.Sequential()

# Add the DenseNet convolutional base model
model.add(densenet)

# Add new layers
model.add(Flatten())
# Disabled alternative head, kept for reference:
#   Dense(72) -> BatchNormalization() -> Activation('relu') -> Dropout(0.248)
model.add(Dense(NUM_CLASSES, activation='softmax'))

# Show a summary of the model. Check the number of trainable parameters
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
densenet121 (Model)          (None, 11, 11, 1024)      7037504   
_________________________________________________________________
flatten_1 (Flatten)          (None, 123904)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 15)                1858575   
Total params: 8,896,079
Trainable params: 1,858,575
Non-trainable params: 7,037,504
_________________________________________________________________


In [None]:
# Stream images from the split folders, compile the model and train it.

# The ImageNet weights were trained on inputs normalised by DenseNet's own
# preprocess_input; without it the network is fed raw 0-255 pixel values.
densenet_preprocess = applications.densenet.preprocess_input
train_datagen = ImageDataGenerator(preprocessing_function=densenet_preprocess)
validation_datagen = ImageDataGenerator(preprocessing_function=densenet_preprocess)

# Change the batchsize according to your system RAM
train_batchsize = 10
val_batchsize = 10

train_generator = train_datagen.flow_from_directory(
    train_folder,
    target_size=(img_height, img_width),
    batch_size=train_batchsize,
    class_mode='categorical')

# shuffle=False keeps the batch order aligned with validation_generator.classes,
# which the evaluation cells rely on.
validation_generator = validation_datagen.flow_from_directory(
    test_folder,
    target_size=(img_height, img_width),
    batch_size=val_batchsize,
    class_mode='categorical',
    shuffle=False)

# Compile the model.
# One-hot targets with a softmax output call for categorical cross-entropy;
# binary cross-entropy scores each output unit independently and makes Keras
# report an inflated per-unit "accuracy".
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# len(generator) is the number of batches including the last partial one, so
# no sample is skipped and validation covers the same batches as the later
# predict/evaluate cells.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=2,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    verbose=1,
    use_multiprocessing=True)

In [None]:
# Class probabilities for every validation image.
# Rewind the generator first so the predictions start at the first batch and
# line up with validation_generator.classes (defensive; cheap).
validation_generator.reset()
# `pickle_safe` is the deprecated name of `use_multiprocessing`, which the
# training call already uses.
prediction = model.predict_generator(validation_generator,
                                     steps=len(validation_generator),
                                     use_multiprocessing=True,
                                     verbose=1)

In [None]:
# Turn the probabilities into hard predictions and report per-class metrics.
preds = np.argmax(prediction, axis=1)
print(preds.shape)

# One-hot matrix of the true classes (consumed by the ROC cell).
y_true = np.zeros((preds.shape[0], validation_generator.num_classes))
y_true[np.arange(preds.shape[0]), validation_generator.classes] = 1

# class index -> class name, and the predicted names.
inv_map = {v: k for k, v in validation_generator.class_indices.items()}
pred_cat = [inv_map[i] for i in preds]

# Pass labels and names explicitly so each report row shows the class name
# instead of a bare index, and so classes absent from this split still get a
# row rather than shifting the names.
class_ids = sorted(inv_map)
class_names = [inv_map[i] for i in class_ids]
print(classification_report(validation_generator.classes, preds,
                            labels=class_ids, target_names=class_names))
print('Accuracy score: ', accuracy_score(validation_generator.classes, preds))

In [None]:
# Loss and accuracy on the validation generator as computed by Keras.
# `pickle_safe` is the deprecated name of `use_multiprocessing`, which the
# training call already uses.
score = model.evaluate_generator(validation_generator,
                                 steps=len(validation_generator),
                                 use_multiprocessing=True,
                                 verbose=1)
# score follows the order [loss] + compiled metrics, so index 1 is accuracy.
print('Accuracy Keras: ', score[1])

In [None]:
# Auc scores
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(validation_generator.num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], prediction[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(15,10))
for i in range(validation_generator.num_classes):
    plt.plot(fpr[i], tpr[i],
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(inv_map[i], roc_auc[i]))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()