In [1]:
import numpy as np
import pandas as pd
import bcolz
import time
import logging
import datetime

import sys
sys.path.append('..')

from bcolzutils import *
from util import *

import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, LearningRateScheduler
from keras import optimizers
from keras.regularizers import l2 

from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input as vgg19_preprocess_input


import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config = config)
# Register the session with the Keras backend. Without this call Keras lazily
# creates its own default session and the one configured above is never the
# session Keras trains in.
K.set_session(sess)

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# ---- experiment configuration ----
# Backbone name; it is embedded in the output file names built further down.
arch = "vgg19"

# Fraction of every split to use. Values below 1 select a subset and cause the
# output file names to be tagged with a "_test" suffix.
percent = 0.25
#percent = 1

# Optimisation settings.
epochs = 15
batch_size = 64
lr = 1e-3            # base learning rate; lr_schedule() scales this value
momentum = 0.9
weight_decay = 1e-5

# Size of the softmax output layer.
num_classes = 133

# Suffix for output file names; stays empty unless percent < 1.
test_prefix = ""

def lr_schedule(epoch):
    """Step-decay schedule based on the module-level ``lr``.

    The base rate is multiplied by 0.1 once for every completed block of
    5 epochs, so epochs 0-4 use ``lr``, epochs 5-9 use ``lr * 0.1``, and so on.
    """
    completed_blocks = epoch // 5
    return lr * (0.1 ** completed_blocks)

# Any run on a subset of the data (percent < 1) tags its output files with "_test".
test_prefix = "_test" if percent < 1 else test_prefix

# Output file names; each one embeds the architecture name and the run tag.
test_result = f'bottleneck_features_{arch}_result{test_prefix}.npz'
model_path = f'../saved_models/weights.best.topmodel.{arch}{test_prefix}.hdf5'
loss_history_csv_name = f'train_top_model_{arch}_loss_history{test_prefix}.csv'

# Timestamp taken when this cell runs.
d = datetime.datetime.today()

# Root logging at DEBUG level through a single stream handler.
logging.basicConfig(level='DEBUG', handlers=[logging.StreamHandler()])
log = logging.getLogger(__name__)

# Root directory of the image dataset.
basedir="/home/tutysara/src/myprojects/dog-project/dogImages"

# Per-split index files: <basedir>/<split>_list.txt
train_idx_path, valid_idx_path, test_idx_path = (
    f"{basedir}/{split}_list.txt" for split in ("train", "valid", "test")
)

# Per-split path stems: <basedir>/<split>
train_name, valid_name, test_name = (
    f"{basedir}/{split}" for split in ("train", "valid", "test")
)

In [3]:
%ls -l {basedir}

total 532
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:00 [0m[01;34mbottleneck_features_vgg19_test_data.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:00 [01;34mbottleneck_features_vgg19_test_labels.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:12 [01;34mbottleneck_features_vgg19_test_y_pred.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:12 [01;34mbottleneck_features_vgg19_test_y_true.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:00 [01;34mbottleneck_features_vgg19_train_data.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:00 [01;34mbottleneck_features_vgg19_train_labels.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:00 [01;34mbottleneck_features_vgg19_valid_data.bclz[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  2 20:00 [01;34mbottleneck_features_vgg19_valid_labels.bclz[0m/
drwxr-xr-x 135 tutysara tutysara   4096 Mar 27  2017 [01;34mtest[0m/
drwxrwxr-x   4 tutysara tutysara   4096 Mar  

In [4]:
# alternate image loading
# define function to load train, test, and validation datasets
from sklearn.datasets import load_files 
def load_dataset(path, num_classes=133):
    """List the image files under ``path`` together with one-hot class targets.

    Parameters
    ----------
    path : str
        Directory passed to ``sklearn.datasets.load_files``; each sub-folder
        is treated as one category.
    num_classes : int, optional
        Width of the one-hot target vectors. Defaults to 133, the value that
        was previously hard-coded here.

    Returns
    -------
    (dog_files, dog_targets)
        ``dog_files`` is an array of file paths and ``dog_targets`` the
        matching array of categorical targets produced by
        ``np_utils.to_categorical``.
    """
    # Only the file names and integer targets are used below, so skip reading
    # every image's bytes into memory (load_files does that by default).
    data = load_files(path, load_content=False)
    dog_files = np.array(data['filenames'])
    dog_targets = np_utils.to_categorical(np.array(data['target']), num_classes)
    return dog_files, dog_targets

# Build (file paths, one-hot targets) for each split. The "/../dogImages" detour
# is kept as-is because it shows up verbatim in the returned file paths.
valid_files, valid_targets = load_dataset(f"{basedir}/../dogImages/valid")
test_files, test_targets = load_dataset(f"{basedir}/../dogImages/test")
train_files, train_targets = load_dataset(f"{basedir}/../dogImages/train")

In [5]:
import random
# Draw up to 15 distinct training indices to spot-check in the next cell.
# Capping at the dataset size avoids the ValueError random.sample raises when
# asked for more items than the population holds (e.g. a very small subset).
indices = random.sample(range(len(train_files)), min(15, len(train_files)))

In [6]:
# Show the sampled file paths, then the class index encoded in each one-hot target row.
print(train_files[indices])
print(train_targets[indices].argmax(axis=1))

[ '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/082.Havanese/Havanese_05609.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/046.Cavalier_king_charles_spaniel/Cavalier_king_charles_spaniel_03298.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/056.Dachshund/Dachshund_03952.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/018.Beauceron/Beauceron_01324.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/109.Norwegian_elkhound/Norwegian_elkhound_07151.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/029.Border_collie/Border_collie_02005.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/064.English_toy_spaniel/English_toy_spaniel_04520.jpg'
 '/home/tutysara/src/myprojects/dog-project/dogImages/../dogImages/train/075.Glen_of_imaal_terrier/Glen_of_imaal_terrier_05141.jpg'
 '/home/tutysara/src/myproj

In [7]:
# Number of samples to keep per split: the fraction `percent`, rounded down.
valid_data_size, test_data_size, train_data_size = (
    int(split_files.shape[0] * percent)
    for split_files in (valid_files, test_files, train_files)
)

# Slice only when a strict subset was requested; otherwise the arrays are left untouched.
# NOTE: the same names are reassigned, so re-running this cell shrinks the splits again.
if percent < 1:
    valid_files, valid_targets = valid_files[:valid_data_size], valid_targets[:valid_data_size]
    test_files, test_targets = test_files[:test_data_size], test_targets[:test_data_size]
    train_files, train_targets = train_files[:train_data_size], train_targets[:train_data_size]

In [8]:
# Print the file-array and target-array shapes for each split (valid, test, train).
for split_files, split_targets in (
    (valid_files, valid_targets),
    (test_files, test_targets),
    (train_files, train_targets),
):
    print(split_files.shape, split_targets.shape)

(208,) (208, 133)
(209,) (209, 133)
(1670,) (1670, 133)


In [9]:
# convert and load images
from keras.preprocessing import image                  
from tqdm import tqdm
# load and preprocess data
from PIL import ImageFile                            
ImageFile.LOAD_TRUNCATED_IMAGES = True  

def path_to_tensor(img_path):
    """Load one image file as a batch of size one.

    The file is opened with ``image.load_img`` resized to 224x224, converted
    to an array with ``image.img_to_array``, and given a new leading axis of
    length 1, so a 224x224 RGB image comes back with shape (1, 224, 224, 3).
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))
    return image.img_to_array(pil_img)[np.newaxis, ...]

def paths_to_tensor(img_paths):
    """Load every path with ``path_to_tensor`` and stack the results along axis 0.

    Iteration is wrapped in ``tqdm`` so a progress bar is shown while loading.
    """
    batches = []
    for img_path in tqdm(img_paths):
        batches.append(path_to_tensor(img_path))
    return np.vstack(batches)

# Load each split's images into one float32 array (order: valid, test, train).
valid_tensors, test_tensors, train_tensors = (
    paths_to_tensor(split_files).astype('float32')
    for split_files in (valid_files, test_files, train_files)
)

100%|██████████| 208/208 [00:02<00:00, 91.04it/s] 
100%|██████████| 209/209 [00:01<00:00, 113.14it/s]
100%|██████████| 1670/1670 [00:17<00:00, 94.35it/s] 


In [10]:
# Keras' preprocess_input writes its result over a float input array, so hand it
# a copy. That keeps the raw *_tensors intact for later cells and lets this cell
# be re-run without applying the 'caffe' preprocessing to the same pixels twice.
valid_data = vgg19_preprocess_input(valid_tensors.copy(), mode='caffe')
test_data = vgg19_preprocess_input(test_tensors.copy(), mode='caffe')
train_data = vgg19_preprocess_input(train_tensors.copy(), mode='caffe')

In [11]:
# Generate a model with all layers (with top)
vgg19 = VGG19(weights='imagenet', include_top=True)

# Attach a new softmax head (num_classes outputs) to the output of the
# second-to-last layer, replacing the network's original prediction layer.
x = Dense(num_classes, activation='softmax', name='my_predictions')(vgg19.layers[-2].output)

# Freeze every pretrained layer first ...
for layer in vgg19.layers:
    layer.trainable = False

# Keras 2 names these keyword arguments `inputs` / `outputs`; the singular
# `input=` / `output=` spellings are deprecated and emit a UserWarning.
my_model = Model(inputs=vgg19.input, outputs=x)

# ... then unfreeze only the last three layers (fc1, fc2 and the new head).
for layer in my_model.layers[-3:]:
    layer.trainable = True
#my_model.summary()

  # This is added back by InteractiveShellApp.init_path()


In [12]:
# List every layer with its trainable flag to verify what was frozen.
for layer in my_model.layers:
    print(f"{layer.name} {layer.trainable}")

input_1 False
block1_conv1 False
block1_conv2 False
block1_pool False
block2_conv1 False
block2_conv2 False
block2_pool False
block3_conv1 False
block3_conv2 False
block3_conv3 False
block3_conv4 False
block3_pool False
block4_conv1 False
block4_conv2 False
block4_conv3 False
block4_conv4 False
block4_pool False
block5_conv1 False
block5_conv2 False
block5_conv3 False
block5_conv4 False
block5_pool False
flatten False
fc1 True
fc2 True
my_predictions True


In [13]:
# Callbacks for this run.
# NOTE: only early stopping and the LR schedule are passed to fit() below.
# `checkpointer` and `csv_logger` are created but never registered, so this
# cell writes neither a weights file nor a loss-history CSV. fit() is also
# called without batch_size, so the `batch_size` setting is not applied here.
lrscheduler = LearningRateScheduler(schedule=lr_schedule)
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)
csv_logger = CSVLogger(loss_history_csv_name, append=True, separator=',')

# Train with SGD + momentum on the in-memory preprocessed image arrays.
my_model.compile(
    optimizer=optimizers.SGD(lr=lr, momentum=momentum),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
my_model.fit(
    train_data,
    train_targets,
    validation_data=(valid_data, valid_targets),
    epochs=epochs,
    callbacks=[early_stopping, lrscheduler],
)

Train on 1670 samples, validate on 208 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

KeyboardInterrupt: 

In [None]:
# Convolutional part of VGG19 only (no dense top), used as a fixed feature extractor.
vgg19_feature_ext = VGG19(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# valid_data / test_data / train_data already hold the 'caffe'-preprocessed
# images. Running preprocess_input on the *_tensors again is unnecessary and,
# because Keras can overwrite float inputs in place, risks feeding
# twice-preprocessed pixels to the network.
bottleneck_features_validation = vgg19_feature_ext.predict(valid_data)
bottleneck_features_test = vgg19_feature_ext.predict(test_data)
bottleneck_features_train = vgg19_feature_ext.predict(train_data)

In [None]:
# Inspect the extracted feature shape; the top model defined later declares input_shape=(7, 7, 512).
bottleneck_features_test.shape

In [None]:
classes = num_classes

# Classifier head for the 7x7x512 bottleneck features: global average pooling,
# two 4096-unit ReLU layers each followed by 50% dropout, then a softmax over
# `classes` outputs.
top_model = Sequential([
    GlobalAveragePooling2D(input_shape=(7, 7, 512)),
    #Flatten(input_shape=(7, 7, 512)),
    Dense(4096, activation='relu', name='fc1'),
    Dropout(0.5, name='fc1-dropout'),
    Dense(4096, activation='relu', name='fc2'),
    Dropout(0.5, name='fc2-dropout'),
    Dense(classes, activation='softmax', name='predictions'),
])

#top_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

top_model.compile(
    optimizer=optimizers.SGD(lr=lr, momentum=momentum),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Callbacks for the top-model run.
# NOTE: `checkpointer` and `csv_logger` are constructed but not passed to fit(),
# so this cell saves no weights and logs no CSV.
lrscheduler = LearningRateScheduler(schedule=lr_schedule)
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)
csv_logger = CSVLogger(loss_history_csv_name, append=True, separator=',')

# Train the dense head on the precomputed bottleneck features.
top_model.fit(
    bottleneck_features_train,
    train_targets,
    validation_data=(bottleneck_features_validation, valid_targets),
    epochs=epochs,
    callbacks=[early_stopping, lrscheduler],
)

In [None]:
# Reset the Keras backend state before the next experiment.
# NOTE(review): models built earlier (my_model, top_model) presumably cannot be
# used after this call and need to be rebuilt — confirm.
K.clear_session()

In [None]:
#2 read data and convert to tensor
col_names = ["X", "y"]


def _read_split(idx_path):
    """Read one space-separated index file and return its first ``percent`` of rows.

    The file has no header; its two columns are named "X" (relative path) and
    "y" (class id). The class ids are shifted down by one so they can be fed
    to ``to_categorical`` with ``num_classes`` columns. ``.iloc`` + ``.assign``
    return a fresh frame, which avoids the SettingWithCopyWarning raised when
    a column is assigned on a slice of another DataFrame.
    """
    split_df = pd.read_csv(idx_path, sep=" ", header=None, names=col_names)
    keep = int(split_df.shape[0] * percent)
    return split_df.iloc[:keep].assign(y=lambda df: df.y - 1)


train_data_df = _read_split(train_idx_path)
valid_data_df = _read_split(valid_idx_path)
test_data_df = _read_split(test_idx_path)

# Absolute image paths: basedir + "/" + relative path from the index file.
valid_files = valid_data_df.X.apply(lambda x: basedir+"/"+x)
test_files = test_data_df.X.apply(lambda x: basedir+"/"+x)
train_files = train_data_df.X.apply(lambda x: basedir+"/"+x)

# Categorical label matrices with num_classes columns.
valid_labels = np_utils.to_categorical(valid_data_df.y, num_classes)
test_labels = np_utils.to_categorical(test_data_df.y, num_classes)
train_labels = np_utils.to_categorical(train_data_df.y, num_classes)

In [None]:
# Spot-check rows 50-74: full file paths, then the matching zero-based class ids.
print(train_files.iloc[50:75].values)
print(train_data_df.iloc[50:75].y.values)

In [None]:
# Print the path-series and label-matrix shapes for each split (valid, test, train).
for split_files, split_labels in (
    (valid_files, valid_labels),
    (test_files, test_labels),
    (train_files, train_labels),
):
    print(split_files.shape, split_labels.shape)

In [None]:
# NOTE(review): `my_model` was built before K.clear_session() ran in an earlier
# cell — presumably it has to be rebuilt before this cell can train it; confirm.
# NOTE(review): train_data / valid_data come from the load_files() listing, while
# train_labels / valid_labels come from the *_list.txt index files — confirm both
# sources enumerate the samples in the same order.
# NOTE: checkpointer, csv_logger and lrscheduler are created but only
# early_stopping is passed to fit().
lrscheduler = LearningRateScheduler(schedule=lr_schedule)
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)
csv_logger = CSVLogger(loss_history_csv_name, append=True, separator=',')

my_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

my_model.fit(
    train_data,
    train_labels,
    validation_data=(valid_data, valid_labels),
    epochs=epochs,
    callbacks=[early_stopping],
)

In [None]:
#3 read bcolz data
# Open the on-disk arrays read-only. From here on the *_data / *_labels names
# refer to bcolz carrays instead of the in-memory arrays built earlier.
valid_data = bcolz.carray(rootdir=f'{valid_name}_data.bclz', mode='r')
test_data = bcolz.carray(rootdir=f'{test_name}_data.bclz', mode='r')
train_data = bcolz.carray(rootdir=f'{train_name}_data.bclz', mode='r')

valid_labels = bcolz.carray(rootdir=f'{valid_name}_labels.bclz', mode='r')
test_labels = bcolz.carray(rootdir=f'{test_name}_labels.bclz', mode='r')
train_labels = bcolz.carray(rootdir=f'{train_name}_labels.bclz', mode='r')

# Print data/label shapes per split (valid, test, train).
for split_data, split_labels in (
    (valid_data, valid_labels),
    (test_data, test_labels),
    (train_data, train_labels),
):
    print(split_data.shape, split_labels.shape)

In [None]:
# Generate a model with all layers (with top)
vgg19 = VGG19(weights='imagenet', include_top=True)

# Attach a new softmax head (num_classes outputs) to the output of the
# second-to-last layer, replacing the network's original prediction layer.
x = Dense(num_classes, activation='softmax', name='my_predictions')(vgg19.layers[-2].output)

# Freeze every pretrained layer first ...
for layer in vgg19.layers:
    layer.trainable = False

# Keras 2 names these keyword arguments `inputs` / `outputs`; the singular
# `input=` / `output=` spellings are deprecated and emit a UserWarning.
my_model = Model(inputs=vgg19.input, outputs=x)

# ... then unfreeze only the last three layers of the new model.
for layer in my_model.layers[-3:]:
    layer.trainable = True
my_model.summary()

In [None]:
# List every layer of the rebuilt model with its trainable flag.
for layer in my_model.layers:
    print(f"{layer.name} {layer.trainable}")

In [None]:
# Print data/label shapes per split (valid, test, train).
for split_data, split_labels in (
    (valid_data, valid_labels),
    (test_data, test_labels),
    (train_data, train_labels),
):
    print(split_data.shape, split_labels.shape)

In [None]:
# One batch generator per split; all share the batch size and the VGG19
# preprocessing function handed to bcolz_data_generator.
gen_kwargs = dict(batch_size=batch_size, preprocess=vgg19_preprocess_input)
train_gen = bcolz_data_generator(train_data, train_labels, **gen_kwargs)
valid_gen = bcolz_data_generator(valid_data, valid_labels, **gen_kwargs)
test_gen = bcolz_data_generator(test_data, test_labels, **gen_kwargs)

In [None]:
# Pull one item from the training generator for inspection (presumably an inputs/labels pair).
# NOTE(review): this advances train_gen by one batch before any training call uses it — confirm that is acceptable.
tX, ty = next(train_gen)

In [None]:
# Shape of the first element of the sampled batch.
tX.shape

In [None]:
# Shape of the validation data array.
valid_data.shape

In [None]:
# Shape check of preprocess_input applied to one batch.
# NOTE(review): train_gen was created with preprocess=vgg19_preprocess_input, so tX
# has presumably been preprocessed already, and this call may modify tX in place — confirm.
vgg19_preprocess_input(tX, mode='caffe').shape

In [None]:
# Shape of the validation data array (same check as a few cells above).
valid_data.shape

In [None]:
# Callbacks for the generator-based run.
# NOTE: `checkpointer` and `csv_logger` are constructed but not passed to
# fit_generator(), so this cell saves no weights and logs no CSV.
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
csv_logger = CSVLogger(loss_history_csv_name, append=True, separator=',')
lrscheduler = LearningRateScheduler(schedule=lr_schedule)

my_model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.SGD(lr=lr, momentum=momentum),
              metrics=['accuracy'])

# One pass over a split needs ceil(n / batch_size) batches. The expression
# `1 + n // batch_size` asks for one surplus batch whenever n is an exact
# multiple of batch_size, so some samples would be seen twice per epoch.
train_steps = int(np.ceil(train_data.shape[0] / batch_size))
valid_steps = int(np.ceil(valid_data.shape[0] / batch_size))

my_model.fit_generator(train_gen,
          steps_per_epoch=train_steps,
          epochs=epochs,
          validation_data=valid_gen,
          validation_steps=valid_steps,
          callbacks=[early_stopping, lrscheduler])

In [None]:
# Run the 'caffe' preprocessing over each split and keep the results under *_data2 names.
# NOTE(review): the training cell that follows still passes train_data / valid_data,
# not these *_data2 results — confirm which inputs were intended.
valid_data2, test_data2, train_data2 = (
    vgg19_preprocess_input(split_data, mode='caffe')
    for split_data in (valid_data, test_data, train_data)
)

In [None]:
# NOTE(review): fit() below receives train_data / valid_data, while the previous
# cell computed train_data2 / valid_data2 and nothing visible uses them — confirm
# which arrays should be trained on.
# NOTE: `checkpointer` and `csv_logger` are constructed but not passed to fit().
lrscheduler = LearningRateScheduler(schedule=lr_schedule)
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
checkpointer = ModelCheckpoint(filepath=model_path, verbose=1, save_best_only=True)
csv_logger = CSVLogger(loss_history_csv_name, append=True, separator=',')

my_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

my_model.fit(
    train_data,
    train_labels,
    validation_data=(valid_data, valid_labels),
    epochs=epochs,
    callbacks=[early_stopping, lrscheduler],
)