In [1]:
import cv2 # for image processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import cv2

from __future__ import print_function

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils


Using TensorFlow backend.


In [2]:
np.random.seed(1671)  # for reproducibility

In [31]:
NB_EPOCH = 100
BATCH_SIZE = 32
VERBOSE = 1
NB_CLASSES = 120   # number of outputs = number of digits
OPTIMIZER = SGD() # SGD optimizer, explained later in this chapter
N_HIDDEN = 128
VALIDATION_SPLIT=0.2 # how much TRAIN is reserved for VALIDATION

In [4]:
# taking the labels for the images
labels = pd.read_csv('data/Dog_breed/labels.csv/labels.csv')
print(labels.head())

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [5]:
targets_series = pd.Series(labels['breed'])
one_hot = pd.get_dummies(targets_series, sparse = True)

In [6]:
one_hot_labels = np.asarray(one_hot)

In [7]:
im_size = 90
x = []
y = []
x_submission = []

In [8]:
i = 0 
for f, breed in tqdm(labels.values):
    img = cv2.imread('data/Dog_breed/train/{}.jpg'.format(f))
    label = one_hot_labels[i]
    x.append(cv2.resize(img, (im_size, im_size)))
    y.append(label)
    i += 1

100%|██████████| 10222/10222 [02:05<00:00, 81.56it/s]


In [9]:
df_test = pd.read_csv('data/Dog_breed/sample_submission.csv/sample_submission.csv')

In [10]:
for f in tqdm(df_test['id'].values):
    img = cv2.imread('data/Dog_breed/test/{}.jpg'.format(f))
    x_submission.append(cv2.resize(img, (im_size, im_size)))

100%|██████████| 10357/10357 [02:17<00:00, 75.44it/s]


In [11]:
x = np.array(x)

In [12]:
x_submission = np.array(x_submission)

In [13]:
RESHAPED = 24300

In [14]:
X = x.reshape(x.shape[0], RESHAPED)

In [15]:
X_submission = x_submission.reshape(x_submission.shape[0], RESHAPED)

In [16]:
X = X.astype('float32')

In [17]:
X_submission = X_submission.astype('float32')

In [18]:
X/=255
X_submission/=255

In [19]:
X.shape

(10222, 24300)

In [20]:
print(X.shape[0], 'train samples')
print(X_submission.shape[0], 'test samples')


10222 train samples
10357 test samples


In [21]:
y = np.array(y)


In [22]:
Y = np_utils.to_categorical(y, NB_CLASSES)

In [23]:
Y

array([[ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.]])

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

In [33]:
model = Sequential()
model.add(Dense(NB_CLASSES, input_shape= (RESHAPED,)))
#Softmax squashes a k-dimensional vector of arbitrary real values into a kdimensional
#vector of real values in the range (0, 1).
model.add(Activation('softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])

history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE, nb_epoch=NB_EPOCH,
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_2 (Dense)                  (None, 120)           2916120     dense_input_2[0][0]              
____________________________________________________________________________________________________
activation_2 (Activation)        (None, 120)           0           dense_2[0][0]                    
Total params: 2,916,120
Trainable params: 2,916,120
Non-trainable params: 0
____________________________________________________________________________________________________
Train on 7359 samples, validate on 1840 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 

In [34]:
score = model.evaluate(X_test, y_test, verbose=VERBOSE)



In [35]:
print("\nTest score:", score[0])
print('Test accuracy:', score[1])


Test score: 16.1180953979
Test accuracy: 1.0


In [36]:
preds = model.predict(X_submission, verbose=1)



In [37]:
sub = pd.DataFrame(preds)
# Set column names to those generated by the one-hot encoding earlier
col_names = one_hot.columns.values
sub.columns = col_names
# Insert the column id from the sample_submission at the start of the data frame
sub.insert(0, 'id', df_test['id'])
sub.head(5)


Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,000621fb3cbb32d8935728e48679680e,1.0,3.89638e-27,3.793541e-33,1.567988e-33,3.3814160000000004e-33,1.220703e-33,3.6402890000000005e-33,3.409887e-33,2.87492e-33,...,2.950359e-33,5.131241e-33,4.0231660000000004e-33,3.8041200000000004e-33,3.939276e-33,5.6740070000000004e-33,3.915514e-33,2.5006720000000002e-33,5.0895240000000004e-33,3.0633730000000003e-33
1,00102ee9d8eb90812350685311fe5890,1.0,8.713766000000001e-33,0.0,0.0,0.0,0.0,1.8150049999999999e-38,0.0,0.0,...,2.77163e-38,1.3573139999999998e-38,1.5514609999999997e-38,1.8391619999999998e-38,1.568205e-38,1.479748e-38,2.364703e-38,0.0,0.0,0.0
2,0012a730dfa437f5f3613fb75efcd4ce,1.0,2.156441e-23,1.055942e-27,6.453466000000001e-28,5.374105e-28,5.19283e-28,4.954145e-28,1.194396e-27,6.590195e-28,...,6.784881e-28,8.722733000000001e-28,1.090285e-27,1.008641e-27,1.080064e-27,8.136501000000001e-28,7.580206e-28,4.8662610000000005e-28,8.529745e-28,1.0081710000000001e-27
3,001510bc8570bbeee98c8d80c8a95ec1,1.0,1.852027e-28,4.587166e-37,3.742222e-37,6.072048e-37,3.469291e-37,4.487096e-37,5.274521e-37,6.241133e-37,...,3.658937e-37,9.368692e-37,7.286943e-37,9.101324999999999e-37,5.655627e-37,5.633879e-37,4.667978e-37,4.011358e-37,5.240466e-37,5.587264e-37
4,001a5f3114548acdefa3d4da05474c2e,1.0,7.916199e-25,6.498569e-30,5.320879e-30,6.500254999999999e-30,5.48639e-30,5.6927689999999996e-30,7.755311e-30,4.521563e-30,...,6.17652e-30,1.05079e-29,1.3869810000000002e-29,1.022407e-29,8.659468e-30,4.2878919999999995e-30,5.8227919999999996e-30,8.879508999999999e-30,1.495663e-29,1.062689e-29


In [38]:
sub.to_csv("out.csv")

#### Even though the epochs was increased from 5 to 100 the result was still the same on the kaggle leaderboard.