In [None]:
# main script

#import libraries for data processing and analysis
import pandas as pd
import numpy as np

In [None]:
# Read the table of training labels from labels.csv.
labels = pd.read_csv("labels.csv")
# Print the table's dimensions, then a preview of its first rows.
for summary in (labels.shape, labels.head()):
    print(summary)

In [None]:
# The "breed" column is the prediction target; the "id" column is kept
# separately so each label can later be matched to its image.
targets = labels["breed"]
filenames = labels["id"]
for column in (targets, filenames):
    print(column.head())

In [None]:
# Build a lookup table whose index is the image id and whose single column
# (labelled 0) holds the breed for that id.
values, indices = np.array(targets), np.array(filenames)
label_df = pd.DataFrame(data=values, index=indices)
print(label_df.head())

In [None]:
#import components of the neural network
from keras.models import Sequential
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers import Dropout
from keras.constraints import maxnorm
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.layers.core import Dense
from keras.layers import GlobalMaxPooling2D
from keras import backend as K

In [None]:
# Count the distinct breeds in the label table (missing values, if any, are
# counted as their own entry, as they would be by unique()).
num_classes = targets.nunique(dropna=False)
print(num_classes)

In [None]:
# Breed of each training image, collected in the order the files are read,
# because the directory listing order need not match the order of labels.csv.
actual_labels = list()

In [None]:
# The source images come in different sizes, so every image is resized to
# one common square size before being stored.
image_side = 128
desired_image_size = (image_side, image_side)

In [None]:
#load training images
from os import listdir
from os.path import isfile, join
import numpy
import cv2
from keras.preprocessing.image import img_to_array

# Read every file in the training directory, resize it to the common size,
# scale its pixel values by 1/255 and record the breed of each image in the
# same order the files are read.
mypath = 'train'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Start from a fresh list so that re-running this cell does not append a
# second copy of every label (or fail once the list has become an array).
actual_labels = []
# One slot per file; each slot will hold that image's pixel array.
train_images = numpy.empty(len(onlyfiles), dtype=object)
for n, fname in enumerate(onlyfiles):
    # The part of the file name before the first "." is the image id.
    path = fname.split(".")[0]
    # DataFrame.get_value was deprecated in pandas 0.21 and removed in 1.0;
    # .at is the scalar label-based accessor that replaces it.
    lab = label_df.at[path, 0]
    new = cv2.imread(join(mypath, fname))
    if new is None:
        # cv2.imread signals an unreadable file by returning None rather than
        # raising, which would otherwise surface as an opaque resize error.
        raise IOError("could not read image file: %s" % join(mypath, fname))
    actual_labels.append(lab)
    new = cv2.resize(new, desired_image_size)
    train_images[n] = img_to_array(new)/255.0

In [None]:
# Turn the collected labels into a NumPy array.
actual_labels = np.asarray(actual_labels)

In [None]:
# File name (without extension) of each test image, kept for later use.
submission_indices = list()

In [None]:
#load testing images
from os import listdir
from os.path import isfile, join
import numpy
import cv2
from keras.preprocessing.image import img_to_array

# Read every file in the test directory, resize it to the common size, scale
# its pixel values by 1/255 and remember each image's id in reading order.
mypath = 'test'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Start from a fresh list so that re-running this cell does not append a
# second copy of every id (or fail once the list has become an array).
submission_indices = []
# One slot per file; each slot will hold that image's pixel array.
test_images = numpy.empty(len(onlyfiles), dtype=object)
for n, fname in enumerate(onlyfiles):
    # The part of the file name before the first "." is the image id.
    path_test = fname.split(".")[0]
    new = cv2.imread(join(mypath, fname))
    if new is None:
        # cv2.imread signals an unreadable file by returning None rather than
        # raising, which would otherwise surface as an opaque resize error.
        raise IOError("could not read image file: %s" % join(mypath, fname))
    submission_indices.append(path_test)
    new = cv2.resize(new, desired_image_size)
    test_images[n] = img_to_array(new)/255.0

In [None]:
# Turn the collected test image ids into a NumPy array.
submission_indices = np.asarray(submission_indices)

In [None]:
# Configure an image generator that randomly transforms the training images,
# to make the model more robust to shifts and other transformations.
from keras.preprocessing.image import ImageDataGenerator
augmentation_settings = dict(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest",
)
adder = ImageDataGenerator(**augmentation_settings)

In [None]:
# train_images and test_images are used exactly as loaded (the former
# self-assignments had no effect); only the labels get a new name here.
target_train = actual_labels

In [None]:
# Encode each breed name as an integer; the fitted encoder is kept so the
# integers can be mapped back to names later.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(target_train)
numeric_labels = le.transform(target_train)

In [None]:
# One-hot encode the integer breed labels, with one column per distinct
# label actually present among the loaded training images.
from keras.utils import to_categorical
num_classes_sample = np.unique(numeric_labels).size
train_labels = to_categorical(numeric_labels, num_classes=num_classes_sample)

In [None]:
#create the desired model architecture based on available computational resources

# Height and width come from the common resize target; 3 is the channel count.
input_shape = (desired_image_size[0],desired_image_size[1],3)

# Three convolutional stages (32, 64 and 128 filters), each made of two 3x3
# convolutions with dropout in between and a 2x2 max-pool at the end.
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape = input_shape, activation='relu', padding='same'))
model.add(Dropout(0.2))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(Dropout(0.2))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(Dropout(0.2))
model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
# Classifier head: two fully connected layers with a max-norm constraint on
# their kernels, each surrounded by dropout.
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu', kernel_constraint=maxnorm(3)))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu', kernel_constraint=maxnorm(3)))
model.add(Dropout(0.2))
# The output width must equal the width of the one-hot training labels, which
# are built with num_classes_sample (breeds present among the loaded images).
# num_classes (breeds listed in labels.csv) can differ from it, and a
# mismatch would break both training and the predictions table.
model.add(Dense(num_classes_sample, activation='softmax'))

In [None]:
# Compile for multi-class classification: categorical cross-entropy loss,
# the Adam optimizer, and accuracy reported as a metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
#print out a synopsis of the model architecture
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

In [None]:
# Pack the per-image arrays into one 4-D array so that they can be fed into
# the network; the first axis indexes the image, the remaining three take
# their sizes from the first training image.
(dim2, dim3, dim4) = train_images[0].shape
num_images = train_images.size
training_data = np.zeros((num_images, dim2, dim3, dim4))
for i, image in enumerate(train_images):
    training_data[i] = image

In [None]:
#fit the model on batches of randomly augmented training images
bs = 256
num_epochs = 5000
# Integer division gives 0 steps when fewer than bs images were loaded, so
# always run at least one step per epoch.
steps = max(1, num_images // bs)
# NOTE(review): newer Keras releases deprecate fit_generator in favour of
# fit; kept here to match the Keras API the rest of the notebook uses.
A = model.fit_generator(adder.flow(training_data, train_labels, batch_size=bs),
                        steps_per_epoch=steps, epochs=num_epochs)

In [None]:
# Persist the trained model to disk so that predictions can be made on
# individual images later without retraining.
model_path = "my_model.h5"
model.save(model_path)

In [None]:
# Pack the per-image test arrays into one 4-D array so that they can be fed
# into the network; the first axis indexes the image, the remaining three
# take their sizes from the first test image.
(dim2, dim3, dim4) = test_images[0].shape
num_test_images = test_images.size
practice_test_images = np.zeros((num_test_images, dim2, dim3, dim4))
for i, image in enumerate(test_images):
    practice_test_images[i] = image

In [None]:
# Run the model on the test images; each row of the result holds the
# probabilities associated with each class for one image.
predictions = model.predict(practice_test_images)
# Show the shape of the result and the first image's probabilities.
for item in (predictions.shape, predictions[0]):
    print(item)

In [None]:
# For every test image, take the index of the most probable class as its
# numeric label, then report how many labels were produced.
number_labels_back = predictions.argmax(axis=1)
print(number_labels_back.shape[0])

In [None]:
# Show the numeric label predicted for the first test image.
first_numeric_label = number_labels_back[0]
print(first_numeric_label)

In [None]:
# The probability of each prediction is the largest value in its row.
probabilities = predictions.max(axis=1)

In [None]:
# Use the fitted label encoder to turn the numeric predictions back into
# breed names.
final_prediction_strings = le.inverse_transform(number_labels_back)
print(final_prediction_strings)

In [None]:
# Report the predicted breed and its probability for the first 30 test
# images; slicing keeps this safe when fewer than 30 images were loaded.
preview_count = 30
preview_pairs = zip(final_prediction_strings[:preview_count], probabilities[:preview_count])
for i, (breed, probability) in enumerate(preview_pairs):
    print("Test image %d is of a(n) %s with probability %f" %(i, breed, probability))

In [None]:
# Collect the results in a dataframe: one row per test image (indexed by its
# id) and one column per breed, named by decoding each numeric label.
numbers = np.arange(num_classes_sample)
column_names = le.inverse_transform(numbers)
final_submission = pd.DataFrame(
    data=predictions,
    index=submission_indices,
    columns=column_names,
)

In [None]:
# Write the table of predicted probabilities to a CSV file.
output_csv = "dog_test_predictions.csv"
final_submission.to_csv(output_csv)

In [None]:
# Preview the first rows of the predictions table.
submission_preview = final_submission.head()
print(submission_preview)