## Dog Breed Identification - Kaggle Submission Exercise

In [80]:
import os
import cv2 
from keras.models import Sequential 
from keras.layers import Dense, Dropout 
from keras.layers import Conv2D 
from keras.layers import MaxPooling2D 
from keras.layers import Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers import Activation
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt 
import sklearn
from sklearn.cross_validation import train_test_split

#### References
- Su, C. J. (2018). CNN from scratch. Retrieved from https://www.kaggle.com/carmnejsu/cnn-from-scratch
- Kostadinov, N. (2017). Dog Breed Classification with Keras. Retrieved from http://machinememos.com/python/keras/artificial%20intelligence/machine%20learning/transfer%20learning/dog%20breed/neural%20networks/convolutional%20neural%20network/tensorflow/image%20classification/imagenet/2017/07/11/dog-breed-image-classification.html

## Preprocessing

In [81]:
lables = pd.read_csv("C:\\Users\\Bill\\.kaggle\\competitions\\dog-breed-identification\\labels.csv")
lables.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [82]:
breeds = lables['breed'].value_counts()
breeds.shape

(120,)

In [83]:
breeds.head(10)

scottish_deerhound      126
maltese_dog             117
afghan_hound            116
entlebucher             115
bernese_mountain_dog    114
shih-tzu                112
great_pyrenees          111
pomeranian              111
basenji                 110
samoyed                 109
Name: breed, dtype: int64

In [64]:
img_size=100
num_channel=1# 3 colour channes

### Create one hot labels

In [27]:
targets = pd.Series(lables['breed'])
one_hot = pd.get_dummies(targets, sparse = True)
one_hot_labels = np.asarray(one_hot)

## Loop image resize process on train and test sets
- initialize, format jpg, and resize images
- Check shape

In [65]:
x_feature = []
y_feature = []

i = 0 
for f, img in tqdm(lables.values): # f for format ,jpg
    train_img = cv2.imread('C:\\Users\\Bill\\.kaggle\\competitions\\dog-breed-identification\\train/{}.jpg'.format(f),0)
    label = one_hot_labels[i]
    train_img_resize = cv2.resize(train_img, (img_size, img_size)) 
    x_feature.append(train_img_resize)
    y_feature.append(label)
    i += 1

100%|███████████████████████████████████| 10222/10222 [00:16<00:00, 624.07it/s]


In [66]:
x_train_array = np.array(x_feature, np.float32) / 255.   # /= 255 for normolisation
print (x_train_array.shape)
x_train_array = np.expand_dims(x_train_array, axis = 3)
print (x_train_array.shape)

(10222, 100, 100)
(10222, 100, 100, 1)


In [67]:
y_train_array = np.array(y_feature, np.uint8)
print (y_train_array.shape)

(10222, 120)


In [68]:
x_train, x_test, y_train, y_test = train_test_split(x_train_array, y_train_array, test_size=0.2, random_state=2)
print (x_train.shape)
print (x_val.shape)

(8177, 100, 100, 1)
(2045, 100, 100, 1)


In [70]:
x_test_feature = []

i = 0 # initialisation
for f in tqdm(test_img.values): # f for format ,jpg
    img = cv2.imread('C:\\Users\\Bill\\.kaggle\\competitions\\dog-breed-identification\\test/{}.jpg'.format(f), 0)
    img_resize = cv2.resize(img, (img_size, img_size)) 
    x_test_feature.append(img_resize)

100%|███████████████████████████████████| 10357/10357 [00:16<00:00, 616.23it/s]


In [73]:
x_test_array = np.array(x_test_feature, np.float32) / 255. 
print (x_test_array.shape)
x_test_array = np.expand_dims(x_test_array, axis = 3)
print (x_test_array.shape)

(10357, 100, 100)
(10357, 100, 100, 1)


### Submit to Kaggle Preprocess 

In [69]:
submission = pd.read_csv('C:\\Users\\Bill\\.kaggle\\competitions\\dog-breed-identification\\submission.csv')
test_img = submission['id']
test_img.head()

0    000621fb3cbb32d8935728e48679680e
1    00102ee9d8eb90812350685311fe5890
2    0012a730dfa437f5f3613fb75efcd4ce
3    001510bc8570bbeee98c8d80c8a95ec1
4    001a5f3114548acdefa3d4da05474c2e
Name: id, dtype: object

## Build a 2 Conv Sequetial CNN Model
- This basic model was designed to run on a CPU using Keras with a Tensorflow backend. 

In [52]:
model = Sequential()

model.add(Conv2D (16, (3,3), input_shape = (img_rows, img_cols, num_channel)))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(4,4)))
model.add(Dropout(0.2))

model.add(Conv2D (32, (3,3))) 
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(4,4)))
model.add(Dropout(0.2))

model.add(Flatten()) 
model.add(Dropout(0.5))
model.add(Dense(256, activation = 'relu')) # fully connected Layer

model.add(Dense(120, activation = 'softmax')) # output layer

### Compile Model

In [53]:
model.compile(optimizer = 'adam' , loss = "categorical_crossentropy", metrics=["accuracy"]) 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 126, 126, 16)      160       
_________________________________________________________________
batch_normalization_2 (Batch (None, 126, 126, 16)      64        
_________________________________________________________________
activation_1 (Activation)    (None, 126, 126, 16)      0         
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 31, 31, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 31, 31, 16)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 29, 29, 32)        4640      
_________________________________________________________________
batch_normalization_3 (Batch (None, 29, 29, 32)        128       
__________

### Fit the model

In [59]:
batch_size = 64
nb_epochs = 3
history = model.fit(x_train, y_train,batch_size=batch_size,epochs=nb_epochs,verbose=2, validation_data=(x_test, y_test),initial_epoch=0)

Train on 8177 samples, validate on 2045 samples
Epoch 1/3
 - 290s - loss: 4.7654 - acc: 0.0116 - val_loss: 4.7459 - val_acc: 0.0176
Epoch 2/3
 - 289s - loss: 4.6989 - acc: 0.0181 - val_loss: 4.6771 - val_acc: 0.0156
Epoch 3/3
 - 288s - loss: 4.6300 - acc: 0.0187 - val_loss: 4.6233 - val_acc: 0.0166


## Predict the model and write the submission csv 

In [60]:
results = model.predict(x_test_data)
prediction = pd.DataFrame(results)

In [61]:
col_names = one_hot.columns.values
prediction.columns = col_names

In [62]:
prediction.insert(0, 'id', submission['id'])

In [63]:
submission = prediction
submission.to_csv('my_submission.csv', index=False)

#### .

In [75]:
prediction.head()

Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,000621fb3cbb32d8935728e48679680e,0.000448,0.022789,0.001101,0.004145,0.005541,0.003519,0.002822,0.018993,0.004655,...,0.009264,0.009615,0.004057,0.00968,0.007494,0.01296,0.015013,0.015072,0.019072,0.008264
1,00102ee9d8eb90812350685311fe5890,0.000297,0.013144,0.001152,0.005504,0.005678,0.002547,0.002432,0.015062,0.004709,...,0.012506,0.010493,0.004034,0.007347,0.007437,0.009824,0.022885,0.014547,0.016485,0.007689
2,0012a730dfa437f5f3613fb75efcd4ce,0.006374,0.011893,0.006798,0.008126,0.008837,0.009413,0.006824,0.010036,0.010208,...,0.00927,0.009989,0.008703,0.010135,0.007764,0.010669,0.009358,0.009969,0.008776,0.007797
3,001510bc8570bbeee98c8d80c8a95ec1,0.008401,0.009082,0.00919,0.00927,0.007824,0.008011,0.008915,0.008841,0.008429,...,0.007984,0.008392,0.008276,0.008043,0.00805,0.007944,0.007573,0.008725,0.007738,0.007864
4,001a5f3114548acdefa3d4da05474c2e,0.006124,0.014334,0.00573,0.00703,0.009459,0.011219,0.005714,0.010421,0.011766,...,0.009493,0.010744,0.009023,0.011946,0.007212,0.013003,0.009601,0.010277,0.008926,0.007416
