In [1]:
import cv2 # for image processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import cv2

from __future__ import print_function

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils


Using TensorFlow backend.


In [2]:
np.random.seed(1671)  # for reproducibility

##### network and training

In [39]:
NB_EPOCH = 20
BATCH_SIZE = 64
VERBOSE = 1
NB_CLASSES = 120   # number of outputs = number of digits
OPTIMIZER = SGD() # SGD optimizer, explained later in this chapter
N_HIDDEN = 128
VALIDATION_SPLIT=0.2 # how much TRAIN is reserved for VALIDATION

In [4]:
# taking the labels for the images
labels = pd.read_csv('data/Dog_breed/labels.csv/labels.csv')
print(labels.head())

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [5]:
targets_series = pd.Series(labels['breed'])
one_hot = pd.get_dummies(targets_series, sparse = True)

In [6]:
one_hot_labels = np.asarray(one_hot)

In [7]:
im_size = 90
x = []
y = []
x_submission = []

In [8]:
i = 0 
for f, breed in tqdm(labels.values):
    img = cv2.imread('data/Dog_breed/train/{}.jpg'.format(f))
    label = one_hot_labels[i]
    x.append(cv2.resize(img, (im_size, im_size)))
    y.append(label)
    i += 1

100%|██████████| 10222/10222 [02:05<00:00, 81.56it/s]


In [9]:
df_test = pd.read_csv('data/Dog_breed/sample_submission.csv/sample_submission.csv')

In [10]:
for f in tqdm(df_test['id'].values):
    img = cv2.imread('data/Dog_breed/test/{}.jpg'.format(f))
    x_submission.append(cv2.resize(img, (im_size, im_size)))

100%|██████████| 10357/10357 [02:17<00:00, 75.44it/s]


In [11]:
x = np.array(x)

In [12]:
x_submission = np.array(x_submission)

In [13]:
RESHAPED = 24300

In [14]:
X = x.reshape(x.shape[0], RESHAPED)

In [15]:
X_submission = x_submission.reshape(x_submission.shape[0], RESHAPED)

In [16]:
X = X.astype('float32')

In [17]:
X_submission = X_submission.astype('float32')

In [18]:
X/=255
X_submission/=255

In [19]:
X.shape

(10222, 24300)

In [20]:
print(X.shape[0], 'train samples')
print(X_submission.shape[0], 'test samples')


10222 train samples
10357 test samples


In [21]:
y = np.array(y)


In [22]:
Y = np_utils.to_categorical(y, NB_CLASSES)

In [23]:
Y

array([[ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.]])

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

In [40]:
# M_HIDDEN hidden layers
# 10 outputs
# final stage is softmax

In [41]:
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])

history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE, nb_epoch=NB_EPOCH,
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_3 (Dense)                  (None, 128)           3110528     dense_input_3[0][0]              
____________________________________________________________________________________________________
activation_3 (Activation)        (None, 128)           0           dense_3[0][0]                    
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 128)           16512       activation_3[0][0]               
____________________________________________________________________________________________________
activation_4 (Activation)        (None, 128)           0           dense_4[0][0]                    
___________________________________________________________________________________________

In [42]:
score = model.evaluate(X_test, y_test, verbose=VERBOSE)



In [43]:
print("\nTest score:", score[0])
print('Test accuracy:', score[1])


Test score: 1.38859566274
Test accuracy: 1.0


In [44]:
preds = model.predict(X_submission, verbose=1)



In [46]:
sub = pd.DataFrame(preds)
# Set column names to those generated by the one-hot encoding earlier
col_names = one_hot.columns.values
sub.columns = col_names
# Insert the column id from the sample_submission at the start of the data frame
sub.insert(0,'id', df_test['id'])
sub.head(5)


Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,000621fb3cbb32d8935728e48679680e,0.511609,0.488384,9.225629e-08,8.491282e-09,1.221754e-08,9.672134e-09,6.206034e-08,8.858015e-08,2.787936e-08,...,8.481834e-08,2.342824e-08,9.786744e-09,1.199383e-08,3.100846e-07,4.844838e-08,1.470889e-08,9.205112e-09,1.20646e-07,3.89872e-07
1,00102ee9d8eb90812350685311fe5890,0.509203,0.490797,5.286141e-09,2.33734e-10,3.277121e-10,2.895046e-10,2.120018e-09,3.446607e-09,8.961821e-10,...,3.57092e-09,1.008358e-09,2.42629e-10,3.839887e-10,1.669401e-08,2.301385e-09,5.377716e-10,2.731575e-10,4.910104e-09,2.277852e-08
2,0012a730dfa437f5f3613fb75efcd4ce,0.507632,0.49219,2.735638e-06,3.456441e-07,4.236083e-07,4.498437e-07,1.621532e-06,2.471615e-06,9.409012e-07,...,2.541788e-06,9.582837e-07,3.625142e-07,5.013495e-07,5.680316e-06,1.652266e-06,5.649341e-07,4.146368e-07,2.799743e-06,7.476724e-06
3,001510bc8570bbeee98c8d80c8a95ec1,0.509758,0.490231,1.799282e-07,1.724071e-08,2.540543e-08,2.021838e-08,8.569853e-08,1.312429e-07,4.212346e-08,...,1.161113e-07,5.439528e-08,1.980268e-08,1.566763e-08,4.932983e-07,9.79979e-08,3.379993e-08,1.795772e-08,1.853936e-07,6.670349e-07
4,001a5f3114548acdefa3d4da05474c2e,0.509386,0.490567,6.80768e-07,9.023868e-08,1.082685e-07,8.621901e-08,4.631534e-07,7.216283e-07,2.280207e-07,...,5.620792e-07,2.380892e-07,1.00698e-07,9.017236e-08,1.651387e-06,4.144409e-07,1.518131e-07,9.855736e-08,7.707391e-07,2.394309e-06


In [47]:
sub.to_csv("out_1.csv")

### There was no improvement in the result. 16.
It could be the reason that too much data is being taken for test and validation