In [14]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as mplot
import pandas as pd
import keras
import os
from keras.preprocessing import image

from keras.layers import Input,Flatten
from keras.optimizers import SGD,RMSprop

In [3]:
# Base directory for every data path built below: the current working directory
# at the time this cell runs.
HOME_DIR = os.getcwd()

In [4]:
def create_directory_structure(competition, base_dir=None):
    """Build the train/test/valid directory paths for a competition.

    No directory is created here; the function only joins path strings.

    Parameters
    ----------
    competition : str
        Name of the sub-directory under ``<base_dir>/data`` that holds the
        competition's files.
    base_dir : str, optional
        Root directory that contains the ``data`` folder. Defaults to the
        notebook-level ``HOME_DIR`` when omitted, which is what the original
        implementation always used.

    Returns
    -------
    tuple of str
        ``(TRAIN_PATH, TEST_PATH, VALID_PATH, TEST_UNKNOWN_PATH)`` where
        ``TEST_UNKNOWN_PATH`` is the ``unknown`` folder inside ``TEST_PATH``.
    """
    if base_dir is None:
        # Looked up at call time so the default keeps following the notebook global.
        base_dir = HOME_DIR
    TRAIN_PATH = os.path.join(base_dir,'data/%s/train'%competition)
    TEST_PATH = os.path.join(base_dir,'data/%s/test'%competition)
    VALID_PATH = os.path.join(base_dir,'data/%s/valid'%competition)
    TEST_UNKNOWN_PATH = os.path.join(TEST_PATH,'unknown')
    return TRAIN_PATH,TEST_PATH,VALID_PATH,TEST_UNKNOWN_PATH

# Resolve the four data directories for the dog_breed competition and make sure
# each of them exists on disk.
TRAIN_PATH,TEST_PATH,VALID_PATH,TEST_UNKNOWN_PATH = create_directory_structure('dog_breed')
# Joined into one string so the line prints the same under Python 2 and Python 3.
print(' '.join([TRAIN_PATH,TEST_PATH,VALID_PATH,TEST_UNKNOWN_PATH]))
# os.makedirs (rather than os.mkdir) also creates a missing parent directory,
# so this cell no longer fails when data/dog_breed does not exist yet.
for path in (TRAIN_PATH,TEST_PATH,VALID_PATH,TEST_UNKNOWN_PATH):
    if not os.path.exists(path):
        os.makedirs(path)

/home/ubuntu/data/dog_breed/train /home/ubuntu/data/dog_breed/test /home/ubuntu/data/dog_breed/valid /home/ubuntu/data/dog_breed/test/unknown


In [46]:
def move_test_files_to_unknown_dir(TEST_PATH,TEST_UNKNOWN_PATH):
    """Move every file sitting directly in TEST_PATH into TEST_UNKNOWN_PATH.

    Only the top level of TEST_PATH is considered; files already inside
    sub-directories are left where they are, so calling this again after a
    successful run is a no-op.

    Parameters
    ----------
    TEST_PATH : str
        Directory whose top-level files should be moved.
    TEST_UNKNOWN_PATH : str
        Destination directory. ``os.renames`` creates it if it is missing.

    Returns
    -------
    None
    """
    # Take only the first os.walk entry instead of materialising the whole tree
    # (which would also traverse the destination folder). The default makes a
    # missing TEST_PATH behave like an empty one instead of raising IndexError.
    files = next(os.walk(TEST_PATH), (TEST_PATH, [], []))[2]
    for file_name in files:
        old = os.path.join(TEST_PATH,file_name)
        new = os.path.join(TEST_UNKNOWN_PATH,file_name)
        os.renames(old,new)

In [30]:
# Load the id -> breed label table and print three quick size checks:
# distinct breeds, distinct ids, and the overall frame shape.
labels_csv = os.path.join(HOME_DIR,'data/dog_breed/labels.csv')
labels = pd.read_csv(labels_csv)
for summary_shape in (labels.breed.unique().shape, labels.id.unique().shape, labels.shape):
    print(summary_shape)
labels.head(3)

(120,)
(10222,)
(10222, 2)


Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese


In [17]:
# Lookup table from the 'id' column to the 'breed' column of the labels frame.
id_column = labels['id'].values
breed_column = labels['breed'].values
id2breed = {image_id: breed_name for image_id, breed_name in zip(id_column, breed_column)}

In [21]:
# Names of the files sitting directly in TRAIN_PATH. Only the first os.walk entry
# is consumed, so per-breed sub-directories created by a previous run are not
# traversed just to be thrown away.
filenames = next(os.walk(TRAIN_PATH))[2]

In [33]:
# Move every training image into a sub-directory named after its breed, renaming
# it to <breed>.<running index>.jpg. The breed is looked up from the part of the
# file name before the first dot.
for idx,filename in enumerate(filenames):
    image_id = filename.split('.')[0]
    breed = id2breed[image_id]
    old = os.path.join(TRAIN_PATH,filename)
    new = os.path.join(TRAIN_PATH,breed,breed+'.%s.jpg'%idx)
    # os.renames creates the breed directory on first use.
    os.renames(old,new)

In [40]:
## Logic to move 20% files to validation dataset
for dir_name in os.listdir(TRAIN_PATH):
    files = list(os.walk(os.path.join(TRAIN_PATH,dir_name)))[0][2]
    files = np.random.permutation(files)
    for filename in files[:int(len(files)*.20)]:
        old = os.path.join(TRAIN_PATH,dir_name,filename)
        new = os.path.join(VALID_PATH,dir_name,filename)
        os.renames(old,new)

In [8]:
# Training images are randomly augmented (rotation, shifts, shear, zoom, channel
# shift, horizontal flip). Validation images come from a plain generator with
# shuffling disabled, so validation metrics are measured on the unmodified images
# and are comparable from one epoch to the next.
batch_size = 8
gen = image.ImageDataGenerator(rotation_range=10, width_shift_range=0.1,
       height_shift_range=0.1, shear_range=0.15, zoom_range=0.1,
       channel_shift_range=10., horizontal_flip=True)
train_batches = gen.flow_from_directory(TRAIN_PATH,batch_size=batch_size,target_size=(224,224))
valid_gen = image.ImageDataGenerator()
valid_batches = valid_gen.flow_from_directory(VALID_PATH,batch_size=batch_size,target_size=(224,224),
                                              shuffle=False)

Found 8221 images belonging to 120 classes.
Found 2001 images belonging to 120 classes.


In [9]:
# Instantiate the Vgg16BN wrapper from the project-local vgg16bn module and call its
# finetune() method with the training batches.
# NOTE(review): finetune() is defined outside this notebook; presumably it swaps the
# final layer for one sized to the classes in train_batches and compiles the model - confirm.
from vgg16bn import Vgg16BN
vgg = Vgg16BN()
vgg.finetune(train_batches)
# NOTE(review): this rebinds the optimizer's lr attribute to a plain float. Whether the
# new value is picked up depends on when Keras builds its training function - confirm,
# or consider keras.backend.set_value on the existing lr variable instead.
vgg.model.optimizer.lr = 0.01

In [10]:
# First training stage: 3 epochs on the training generator, evaluated against the
# validation generator. The per-epoch sample counts are taken from each generator's
# nb_sample attribute.
vgg.model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample, nb_epoch=3,
                validation_data=valid_batches, nb_val_samples=valid_batches.nb_sample)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f24dbe30550>

In [15]:
# Second training stage: mark every layer after the first Flatten layer as
# trainable, recompile with a small SGD learning rate, and train for 4 epochs.
layers = vgg.model.layers
flatten_positions = [pos for pos,candidate in enumerate(layers) if type(candidate) is Flatten]
dense_start = flatten_positions[0]
for dense_layer in layers[dense_start+1:]:
    dense_layer.trainable = True
# Recompiling is what makes the changed trainable flags take effect.
dense_sgd = SGD(lr=0.0001)
vgg.model.compile(optimizer=dense_sgd,loss='categorical_crossentropy',metrics=['accuracy'])

stage_2_options = dict(samples_per_epoch=train_batches.nb_sample, nb_epoch=4,
                       validation_data=valid_batches, nb_val_samples=valid_batches.nb_sample)
vgg.model.fit_generator(train_batches, **stage_2_options)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f2528045350>

In [16]:
# Third training stage: additionally mark every layer from index 12 onwards as
# trainable, recompile with an even smaller SGD learning rate, and train for 5 epochs.
# NOTE(review): index 12 is a hard-coded position in vgg.model.layers; which layer it
# corresponds to depends on the Vgg16BN architecture - confirm.
for unfrozen_layer in layers[12:]:
    unfrozen_layer.trainable = True
fine_sgd = SGD(lr=0.00001)
vgg.model.compile(optimizer=fine_sgd,loss='categorical_crossentropy',metrics=['accuracy'])
stage_3_options = dict(samples_per_epoch=train_batches.nb_sample, nb_epoch=5,
                       validation_data=valid_batches, nb_val_samples=valid_batches.nb_sample)
vgg.model.fit_generator(train_batches, **stage_3_options)

Epoch 1/5

KeyboardInterrupt: 

In [None]:
# Predict class probabilities for the unlabeled test images.
# The test generator must not shuffle: rows of `predictions` are later paired with
# test_batches.filenames, which is only valid when batches are produced in
# directory order (flow_from_directory shuffles by default). A plain
# ImageDataGenerator is used so test images are not randomly augmented, and
# class_mode=None makes the generator yield image batches only, with no labels.
test_gen = image.ImageDataGenerator()
test_batches = test_gen.flow_from_directory(directory = TEST_PATH,target_size=(224,224),batch_size = 64,
                                            class_mode=None,shuffle=False)
predictions = vgg.model.predict_generator(test_batches,test_batches.nb_sample)
print(predictions.shape)
#predictions[:1]
#predictions = np.clip(predictions,0.02,0.98)

In [None]:
# Reduce each entry of test_batches.filenames to the part between the last '/' and
# the first '.' that follows it, then assemble one column per class plus an 'id' column.
filenames = []
for listed_path in test_batches.filenames:
    base_name = listed_path.split('/')[-1]
    filenames.append(base_name.split('.')[0])
print(len(filenames))

# predictions.T is iterated row by row, pairing one row with each entry of vgg.classes.
# NOTE(review): assumes vgg.classes is ordered like the model's output columns - confirm.
final_dict = dict(zip(vgg.classes,predictions.T))
final_dict['id'] = filenames
df = pd.DataFrame(final_dict)

In [None]:
# Preview the first three rows of the prediction table.
df.head(3)

In [None]:
# Write the submission CSV with 'id' as the first column, followed by the
# remaining columns in their existing order.
total_cols = df.columns.values.tolist()
total_cols.insert(0,total_cols.pop(total_cols.index('id')))
df = df[total_cols]
# Row format: the id as a string, then one '%f' slot for every other column.
float_slots = ['%f']*(len(total_cols)-1)
format_txt = '%s,'+','.join(float_slots)
headers = ','.join(total_cols)
submission_file_name = 'submission_dog_breed.csv'
# comments='' stops np.savetxt from prefixing the header line with '# '.
np.savetxt(submission_file_name,df.values, fmt=format_txt, header=headers, comments='')

In [None]:
#df.to_csv('submission.csv')
# Read the submission file back from disk (rebinding df to the parsed result) and
# preview its first three rows.
df = pd.read_csv('submission_dog_breed.csv')
df.head(3)

In [None]:
!kg submit -u 'sunny.2309@yahoo.in' -p 'dm1832dk8847' -c 'dog-breed-identification' 'submission_dog_breed.csv'

In [106]:
# Show a link to the submission file in the cell output.
from IPython.display import FileLink
FileLink('submission_dog_breed.csv')