In [72]:
import cv2 # for image processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import cv2

from __future__ import print_function

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop
from keras.utils import np_utils


##### We decide to randomly drop with the dropout probability some of the values propagated inside our internal dense network of hidden layers. In machine learning, this is a well-known form of regularization

In [2]:
np.random.seed(1671)  # for reproducibility

##### network and training

In [75]:
NB_EPOCH = 2
BATCH_SIZE = 64
VERBOSE = 1
NB_CLASSES = 120   # number of outputs = number of digits
OPTIMIZER = RMSprop() # SGD optimizer, explained later in this chapter
N_HIDDEN = 128
VALIDATION_SPLIT=0.2 # how much TRAIN is reserved for VALIDATION
DROPOUT = 0.3

In [4]:
# taking the labels for the images
labels = pd.read_csv('data/Dog_breed/labels.csv/labels.csv')
print(labels.head())

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [5]:
targets_series = pd.Series(labels['breed'])
one_hot = pd.get_dummies(targets_series, sparse = True)

In [6]:
one_hot_labels = np.asarray(one_hot)

In [63]:
im_size = 90
x = []
y = []
x_submission = []

In [8]:
i = 0 
for f, breed in tqdm(labels.values):
    img = cv2.imread('data/Dog_breed/train/{}.jpg'.format(f))
    label = one_hot_labels[i]
    x.append(cv2.resize(img, (im_size, im_size)))
    y.append(label)
    i += 1

100%|██████████| 10222/10222 [02:05<00:00, 81.56it/s]


In [9]:
df_test = pd.read_csv('data/Dog_breed/sample_submission.csv/sample_submission.csv')

In [10]:
for f in tqdm(df_test['id'].values):
    img = cv2.imread('data/Dog_breed/test/{}.jpg'.format(f))
    x_submission.append(cv2.resize(img, (im_size, im_size)))

100%|██████████| 10357/10357 [02:17<00:00, 75.44it/s]


In [11]:
x = np.array(x)

In [12]:
x_submission = np.array(x_submission)

In [13]:
RESHAPED = 24300

In [14]:
X = x.reshape(x.shape[0], RESHAPED)

In [15]:
X_submission = x_submission.reshape(x_submission.shape[0], RESHAPED)

In [16]:
X = X.astype('float32')

In [17]:
X_submission = X_submission.astype('float32')

In [18]:
X/=255
X_submission/=255

In [19]:
X.shape

(10222, 24300)

In [20]:
print(X.shape[0], 'train samples')
print(X_submission.shape[0], 'test samples')


10222 train samples
10357 test samples


In [21]:
y = np.array(y)


In [22]:
Y = np_utils.to_categorical(y, NB_CLASSES)

In [23]:
Y

array([[ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.]])

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)

In [40]:
# M_HIDDEN hidden layers
# 10 outputs
# final stage is softmax

In [76]:
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dropout(DROPOUT))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dropout(DROPOUT))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])

history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE, nb_epoch=NB_EPOCH,
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_19 (Dense)                 (None, 128)           3110528     dense_input_9[0][0]              
____________________________________________________________________________________________________
activation_19 (Activation)       (None, 128)           0           dense_19[0][0]                   
____________________________________________________________________________________________________
dropout_9 (Dropout)              (None, 128)           0           activation_19[0][0]              
____________________________________________________________________________________________________
dense_20 (Dense)                 (None, 128)           16512       dropout_9[0][0]                  
___________________________________________________________________________________________

In [77]:
score = model.evaluate(X_test, y_test, verbose=VERBOSE)



In [78]:
print("\nTest score:", score[0])
print('Test accuracy:', score[1])


Test score: 1.38826262206
Test accuracy: 1.0


In [79]:
preds = model.predict(X_submission, verbose=1)



In [69]:
sub = pd.DataFrame(preds)
# Set column names to those generated by the one-hot encoding earlier
col_names = one_hot.columns.values
sub.columns = col_names
# Insert the column id from the sample_submission at the start of the data frame
sub.insert(0,'id', df_test['id'])
sub.head(5)


Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,000621fb3cbb32d8935728e48679680e,0.518612,0.481388,5.620302e-09,8.070852e-09,6.682184e-09,3.699132e-09,3.911088e-09,2.921118e-09,1.290808e-09,...,5.506919e-09,9.760791e-10,5.013177e-10,7.494976e-09,6.034191e-09,1.567237e-09,8.699262e-10,4.67179e-09,1.748816e-09,3.834336e-09
1,00102ee9d8eb90812350685311fe5890,0.52178,0.47822,6.948585e-10,1.131411e-09,8.420001e-10,4.74852e-10,4.614246e-10,3.395648e-10,1.823841e-10,...,6.754443e-10,8.165721e-11,5.494359e-11,7.329278e-10,6.544521e-10,1.723162e-10,6.355007e-11,5.436413e-10,2.21999e-10,4.847102e-10
2,0012a730dfa437f5f3613fb75efcd4ce,0.514727,0.48525,2.151393e-07,2.955244e-07,2.104948e-07,1.518729e-07,1.347309e-07,1.210495e-07,6.831367e-08,...,1.942867e-07,4.499255e-08,2.993334e-08,1.989607e-07,2.081502e-07,8.935012e-08,4.010974e-08,1.587783e-07,9.178515e-08,1.678264e-07
3,001510bc8570bbeee98c8d80c8a95ec1,0.518589,0.48141,1.033759e-08,1.211641e-08,1.424697e-08,7.239069e-09,7.011143e-09,4.135758e-09,3.055612e-09,...,1.199251e-08,1.414842e-09,9.336653e-10,1.653693e-08,8.089162e-09,3.292229e-09,1.622805e-09,9.786785e-09,3.542572e-09,8.416581e-09
4,001a5f3114548acdefa3d4da05474c2e,0.515686,0.484304,8.431363e-08,1.309392e-07,9.888107e-08,6.206069e-08,6.002058e-08,4.293369e-08,2.709974e-08,...,9.334423e-08,1.7211e-08,1.026023e-08,9.62218e-08,7.181468e-08,3.677409e-08,1.519932e-08,7.878992e-08,4.242249e-08,6.110628e-08


In [70]:
sub.to_csv("out_3.csv")

As a rule of thumb, if during the training we see that the loss increases on validation, after an initial
decrease, then we have a problem of model complexity that overfits training.