# Models for Larger Dataset

In [1]:
import os, sys
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import GlobalAveragePooling2D
from keras.layers.core import Dense, Activation, Dropout, Lambda
from keras.layers.convolutional import Conv2D
from keras.optimizers import SGD, RMSprop, Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import pickle

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Root directory of the dataset. The trailing slash matters: the sub-paths
# below are built by plain string concatenation.
path = "/nfs/turbo/intmed-bnallamo-turbo/wsliu/Data/colonoscopy2/"

# 'models/' directory under the root. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: it also creates missing parent directories and does
# not fail if the directory appears between the check and the creation.
model_path = path + 'models/'
os.makedirs(model_path, exist_ok=True)

# Directory whose image files are listed and fed to the data iterators.
data_path = path + 'subset/'

In [4]:
# Mini-batch size handed to the train/test image iterators.
batch_size = 32

## Preparation
Check the labels and image files, then split the data into training and validation sets.

In [6]:
# Names (not full paths) of every entry in the image directory.
filenames = os.listdir(data_path)

In [5]:
# Load the labeling spreadsheet that sits in the dataset root.
labels_xlsx = path + 'colonoscopy_mayo_labeled_07May.xlsx'
labels = pd.read_excel(labels_xlsx)

In [15]:
# Show which columns the label spreadsheet provides.
labels.columns

Index(['basefile', 'pathname', 'SourceReportName', 'Inflamm_Mayo_0',
       'Inflamm_Mayo_1', 'Inflamm_Mayo_2', 'Inflamm_Mayo_3', 'Poor_GRAPHIC',
       'Poor_Image_UNCLASSIFIED'],
      dtype='object')

In [34]:
# Set of labeled file names, for fast membership tests against the directory listing.
labeled = set(labels['basefile'])

In [42]:
from shutil import copyfile

In [44]:
# Copy every labeled image from data_path into the working 'subset' directory.
subset_dir = path + 'subset/'
os.makedirs(subset_dir, exist_ok=True)  # copyfile does not create the destination directory
for f in filenames:
    if f not in labeled:
        continue
    src = data_path + f
    dst = subset_dir + f
    # data_path may itself be the subset directory; copyfile raises
    # SameFileError when source and destination are the same file, so an
    # image that is already in place is skipped instead of aborting the loop.
    if os.path.realpath(src) == os.path.realpath(dst):
        continue
    copyfile(src, dst)

In [62]:
# Class balance of the highest Mayo score flag.
labels['Inflamm_Mayo_3'].value_counts()

False    11666
True      1150
Name: Inflamm_Mayo_3, dtype: int64

In [7]:
# Derive three binary targets from the Mayo flags; each column name shows
# where the scale is cut: 0 vs 1-3, 0-1 vs 2-3, and 0-2 vs 3.
# NOTE(review): split0_123 is defined as "not Mayo 0", so a row with no Mayo
# flag set at all would be counted as positive — confirm no such rows exist.
labels['split0_123'] = (~labels['Inflamm_Mayo_0']).astype(int)
labels['split01_23'] = (labels['Inflamm_Mayo_2'] | labels['Inflamm_Mayo_3']).astype(int)
labels['split012_3'] = labels['Inflamm_Mayo_3'].astype(int)

In [73]:
# Class balance for the 0 vs 1-3 target.
labels['split0_123'].value_counts()

0    7214
1    5602
Name: split0_123, dtype: int64

In [74]:
# Class balance for the 0-1 vs 2-3 target.
labels['split01_23'].value_counts()

0    9961
1    2855
Name: split01_23, dtype: int64

In [8]:
# Class balance for the 0-2 vs 3 target.
labels['split012_3'].value_counts()

0    11666
1     1150
Name: split012_3, dtype: int64

In [9]:
# Hold out 30% of the groups for testing. Grouping on SourceReportName keeps
# all rows that share a report on the same side of the split.
# random_state is fixed so a kernel restart reproduces the same partition.
split = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
ind = split.split(labels, groups=labels['SourceReportName'])
trn_ind, tst_ind = next(ind)

# split() yields positional row indices, so rows are selected with .iloc;
# .loc would only be correct while `labels` carries a default 0..n-1 index.
trn_patients = set(labels['SourceReportName'].iloc[trn_ind])
tst_patients = set(labels['SourceReportName'].iloc[tst_ind])

trn_df = labels.iloc[trn_ind]
tst_df = labels.iloc[tst_ind]

In [10]:
# Number of distinct SourceReportName values on each side of the split.
tuple(map(len, (trn_patients, tst_patients)))

(1615, 693)

In [12]:
# Class balance of the 0-2 vs 3 target within the held-out rows.
tst_df['split012_3'].value_counts()

0    3533
1     389
Name: split012_3, dtype: int64

## Data Generator

In [11]:
# Make the parent directory importable (added to sys.path once, if absent).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [12]:
from keras_addon import ImageFrameGenerator

In [13]:
# Augmentation settings for the training generator.
# NOTE(review): the option names match keras' ImageDataGenerator; presumably
# ImageFrameGenerator forwards them unchanged — confirm in keras_addon.
augmentation = dict(
    rotation_range=180,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
)
train_gen = ImageFrameGenerator(**augmentation)

In [14]:
# Generator for the held-out data, built with default arguments (no augmentation options passed).
test_gen = ImageFrameGenerator()

In [15]:
# Batch iterator over the training rows, reading images from data_path.
# Presumably 'basefile' is the filename column and the three split columns
# are the targets — confirm against keras_addon.flow_from_frame.
# NOTE(review): the recorded run of this cell ended in a TypeError about a
# missing 'follow_links' argument raised outside this notebook; the call
# itself is unchanged here.
train_itr = train_gen.flow_from_frame(
    data_path,
    trn_df,
    'basefile',
    ['split0_123', 'split01_23', 'split012_3'],
    target_size=(256, 256),
    batch_size=batch_size,
)

TypeError: _count_valid_files_in_directory() missing 1 required positional argument: 'follow_links'

In [19]:
# Batch iterator over the held-out rows, reading images from data_path.
# Presumably 'basefile' is the filename column and the three split columns
# are the targets — confirm against keras_addon.flow_from_frame.
test_itr = test_gen.flow_from_frame(
    data_path,
    tst_df,
    'basefile',
    ['split0_123', 'split01_23', 'split012_3'],
    target_size=(256, 256),
    batch_size=batch_size,
)

Found 115729 images in the directory.
Using 3922 images to generate mini-batches.


## Model Building

In [20]:
from keras.applications.inception_v3 import InceptionV3

In [25]:
import tensorflow as tf
import keras

In [26]:
# Record the installed Keras version alongside the results.
keras.__version__

'2.1.2'

In [22]:
# InceptionV3 with ImageNet weights and without its top classification layers.
base_model = InceptionV3(include_top=False, weights='imagenet')

InternalError: Failed to create session.

In [21]:
# Head on top of the base network: global average pooling over the feature
# maps, then a 3-unit sigmoid layer (each output is an independent value in 0..1).
# Presumably the 3 units correspond to the three split_* targets — confirm.
x = GlobalAveragePooling2D()(base_model.output)
predictions = Dense(3, activation='sigmoid')(x)
model = Model(inputs=base_model.input, outputs=predictions)

InternalError: Failed to create session.