[View in Colaboratory](https://colab.research.google.com/github/trainingroom/datascience/blob/master/cats_vs_dogs_cnn_dataAugmentation.ipynb)

In [0]:
!pip install kaggle

In [0]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)

In [3]:
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

Download 100%.


In [4]:
!kaggle competitions download -c dogs-vs-cats -p /content/kaggle/dogs-vs-cats

Downloading sampleSubmission.csv to /content/kaggle/dogs-vs-cats
  0%|                                               | 0.00/86.8k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 86.8k/86.8k [00:00<00:00, 37.6MB/s]
Downloading test1.zip to /content/kaggle/dogs-vs-cats
 92%|█████████████████████████████████████▋   | 249M/271M [00:01<00:00, 188MB/s]
100%|█████████████████████████████████████████| 271M/271M [00:01<00:00, 157MB/s]
Downloading train.zip to /content/kaggle/dogs-vs-cats
 99%|████████████████████████████████████████▋| 539M/543M [00:04<00:00, 130MB/s]
100%|█████████████████████████████████████████| 543M/543M [00:04<00:00, 133MB/s]


In [0]:
import zipfile
import os
os.chdir('/content/kaggle/dogs-vs-cats')
zip_ref = zipfile.ZipFile('train.zip', 'r')
zip_ref.extractall('/content/kaggle/dogs-vs-cats/extracted')
zip_ref.close()

In [0]:

import os
import shutil
import matplotlib.pyplot as plt
import random

def preapare_full_dataset_for_flow(train_dir_original, test_dir_original, target_base_dir, val_percent=0.2):
    train_dir = os.path.join(target_base_dir, 'train')
    validation_dir = os.path.join(target_base_dir, 'validation')
    test_dir = os.path.join(target_base_dir, 'test')

    if not os.path.exists(target_base_dir):          
        os.mkdir(target_base_dir)
        os.mkdir(train_dir)
        os.mkdir(validation_dir)
        os.mkdir(test_dir)
        for c in ['dogs', 'cats']: 
            os.mkdir(os.path.join(train_dir, c))
            os.mkdir(os.path.join(validation_dir, c))
        os.mkdir(os.path.join(test_dir, 'images'))
        print('created the required directory structure')
        
        files = os.listdir(train_dir_original)
        train_files = [os.path.join(train_dir_original, f) for f in files]
        random.shuffle(train_files)    
        n = int(len(train_files) * val_percent)
        val = train_files[:n]
        train = train_files[n:]  

        for t in train:
            if 'cat' in t:
                shutil.copy2(t, os.path.join(train_dir, 'cats'))
            else:
                shutil.copy2(t, os.path.join(train_dir, 'dogs'))
     
        for v in val:
            if 'cat' in v:
                shutil.copy2(v, os.path.join(validation_dir, 'cats'))
            else:
                shutil.copy2(v, os.path.join(validation_dir, 'dogs'))
        files = os.listdir(test_dir_original)
        test_files = [os.path.join(test_dir_original, f) for f in files]
        for t in test_files:
            shutil.copy2(t, os.path.join(test_dir, 'images'))
    else:
        print('required directory structure already exists. learning continues with existing data')

    nb_train_samples = 0  
    nb_validation_samples = 0
    for c in ['dogs', 'cats']:
        nb_train_samples = nb_train_samples + len(os.listdir(os.path.join(train_dir, c)))
    print('total training images:', nb_train_samples)
    for c in ['dogs', 'cats']:
        nb_validation_samples = nb_validation_samples + len(os.listdir(os.path.join(validation_dir, c)))
    print('total validation images:', nb_validation_samples)
    nb_test_samples = len(os.listdir(os.path.join(test_dir, 'images')))
    print('total test images:', nb_test_samples )
    
    return train_dir, validation_dir, test_dir, nb_train_samples, nb_validation_samples, nb_test_samples

def preapare_small_dataset_for_flow(train_dir_original, test_dir_original, target_base_dir):
    train_dir = os.path.join(target_base_dir, 'train')
    validation_dir = os.path.join(target_base_dir, 'validation')
    test_dir = os.path.join(target_base_dir, 'test')

    if not os.path.exists(target_base_dir):          
        os.mkdir(target_base_dir)
        os.mkdir(train_dir)
        os.mkdir(validation_dir)
        os.mkdir(test_dir)
        for c in ['dogs', 'cats']: 
            os.mkdir(os.path.join(train_dir, c))
            os.mkdir(os.path.join(validation_dir, c))
        os.mkdir(os.path.join(test_dir, 'images'))
        print('created the required directory structure')        
       
        train_cats = ['cat.{}.jpg'.format(i) for i in range(11000)]
        for t in train_cats:
             shutil.copy2(os.path.join(train_dir_original, t), os.path.join(train_dir, 'cats'))
        train_dogs = ['dog.{}.jpg'.format(i) for i in range(11000)]
        for t in train_dogs:
             shutil.copy2(os.path.join(train_dir_original, t), os.path.join(train_dir, 'dogs'))        
        val_cats = ['cat.{}.jpg'.format(i) for i in range(11000, 12500)]
        for t in val_cats:
             shutil.copy2(os.path.join(train_dir_original, t), os.path.join(validation_dir, 'cats'))
        val_dogs = ['dog.{}.jpg'.format(i) for i in range(11000, 12500)]
        for t in val_dogs:
             shutil.copy2(os.path.join(train_dir_original, t), os.path.join(validation_dir, 'dogs'))

        files = os.listdir(test_dir_original)           
        test_files = [os.path.join(test_dir_original, f) for f in files]
        for t in test_files:
            shutil.copy2(t, os.path.join(test_dir, 'images'))
    else:
        print('required directory structure already exists. learning continues with existing data')
    
    nb_train_samples = 0  
    nb_validation_samples = 0
    for c in ['dogs', 'cats']:
        nb_train_samples = nb_train_samples + len(os.listdir(os.path.join(train_dir, c)))
    print('total training images:', nb_train_samples)
    for c in ['dogs', 'cats']:
        nb_validation_samples = nb_validation_samples + len(os.listdir(os.path.join(validation_dir, c)))
    print('total validation images:', nb_validation_samples)
    nb_test_samples = len(os.listdir(os.path.join(test_dir, 'images')))
    print('total test images:', nb_test_samples )

    return train_dir, validation_dir, test_dir, nb_train_samples, nb_validation_samples, nb_test_samples
def plot_loss_accuracy(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    epochs = range(len(history.epoch))
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    
    plt.figure()
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()

In [8]:

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense, Dropout
from keras import backend as K
import os, shutil, random
import matplotlib.pyplot as plt
import pandas as pd
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [9]:
train_dir, validation_dir, test_dir, nb_train_samples, nb_validation_samples,nb_test_samples = preapare_full_dataset_for_flow(
                            train_dir_original='/content/kaggle/dogs-vs-cats/extracted/train', 
                            test_dir_original='/content/kaggle/dogs-vs-cats/extracted/test1',
                            target_base_dir='/content/kaggle/dogs-vs-cats/extracted/target')

created the required directory structure
total training images: 20000
total validation images: 5000
total test images: 12500


In [0]:


img_width, img_height = 150, 150
epochs = 50
batch_size = 20

if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

In [11]:

model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu',input_shape=input_shape))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 72, 72, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 15, 15, 128)       147584    
__________

In [0]:

model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=['accuracy'])

In [0]:

#Data Augmentation (New Data Generation)
train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

In [0]:
#If we want, you can write all these Augmented data into new files
validation_datagen = ImageDataGenerator(rescale=1. / 255)

In [15]:

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical')
validation_generator = validation_datagen.flow_from_directory(
    validation_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')   
save_weights = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.


In [16]:
history = model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples//batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=nb_validation_samples//batch_size,
    callbacks=[save_weights, early_stopping])


Epoch 1/100
Epoch 2/100
   4/1000 [..............................] - ETA: 1:04 - loss: 1.0960e-07 - acc: 1.0000

Epoch 3/100
  49/1000 [>.............................] - ETA: 2:24 - loss: 1.0960e-07 - acc: 1.0000

Epoch 4/100
  67/1000 [=>............................] - ETA: 2:35 - loss: 1.0960e-07 - acc: 1.0000

Epoch 5/100
  62/1000 [>.............................] - ETA: 2:30 - loss: 1.0960e-07 - acc: 1.0000

Epoch 6/100
  72/1000 [=>............................] - ETA: 2:31 - loss: 1.0960e-07 - acc: 1.0000

Epoch 7/100
  63/1000 [>.............................] - ETA: 2:32 - loss: 1.0960e-07 - acc: 1.0000

Epoch 8/100
  66/1000 [>.............................] - ETA: 2:30 - loss: 1.0960e-07 - acc: 1.0000

Epoch 9/100
  76/1000 [=>............................] - ETA: 2:30 - loss: 1.0960e-07 - acc: 1.0000

Epoch 10/100
  75/1000 [=>............................] - ETA: 2:33 - loss: 1.0960e-07 - acc: 1.0000

Epoch 11/100
  73/1000 [=>............................] - ETA: 2:32 - loss: 1.0960e-07 - acc: 1.0000

Epoch 12/100
  74/1000 [=>............................] - ETA: 2:32 - loss: 1.0960e-07 - acc: 1.0000

Epoch 13/100
  71/1000 [=>............................] - ETA: 2:33 - loss: 1.0960e-07 - acc: 1.0000

Epoch 14/100
  76/1000 [=>............................] - ETA: 2:32 - loss: 1.0960e-07 - acc: 1.0000

Epoch 15/100
  70/1000 [=>............................] - ETA: 2:32 - loss: 1.0960e-07 - acc: 1.0000

Epoch 16/100
  59/1000 [>.............................] - ETA: 2:33 - loss: 1.0960e-07 - acc: 1.0000

Epoch 17/100
  53/1000 [>.............................] - ETA: 2:29 - loss: 1.0960e-07 - acc: 1.0000

Epoch 18/100
  65/1000 [>.............................] - ETA: 2:32 - loss: 1.0960e-07 - acc: 1.0000

Epoch 19/100
  71/1000 [=>............................] - ETA: 2:31 - loss: 1.0960e-07 - acc: 1.0000

Epoch 20/100
  72/1000 [=>............................] - ETA: 2:31 - loss: 1.0960e-07 - acc: 1.0000

Epoch 21/100
  78/1000 [=>............................] - ETA: 2:34 - loss: 1.0960e-07 - acc: 1.0000

Epoch 00021: early stopping


In [17]:
test_datagen = ImageDataGenerator(rescale=1. / 255)
test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
#print(test_generator.filenames)
probabilities = model.predict_generator(test_generator, nb_test_samples//batch_size)

mapper = {}
i = 0

Found 12500 images belonging to 1 classes.


In [0]:
for file in test_generator.filenames:
    id = int(file.split('/')[1].split('.')[0])
    #Lexographic order
    #mapper[id] = probabilities[i][0] #Cats
    mapper[id] = probabilities[i][1] #Dogs
    i += 1
    
#od = collections.OrderedDict(sorted(mapper.items()))    
tmp = pd.DataFrame({'id':list(mapper.keys()),'label':list(mapper.values())})    
tmp.to_csv('submission.csv', columns=['id','label'], index=False)

In [20]:
os.getcwd()
os.listdir()

['extracted',
 'test1.zip',
 'model.h5',
 'submission.csv',
 'sampleSubmission.csv',
 'train.zip']