# Dogs vs Cats Redux from Scratch

## Imports, Constants and Settings

In [1]:
import os,sys
from shutil import copyfile
from PIL import Image

utils_path = os.path.abspath(os.path.join('./utils'))
if utils_path not in sys.path:
    sys.path.append(utils_path)
from utils import *

Using Theano backend.


In [2]:
# Directory layout for the notebook. Every *_PATH constant ends with a trailing
# slash because later cells build file paths by plain string concatenation
# (e.g. RESULTS_PATH + latest_weights_filename).
CURRENT_DIR = os.getcwd()
LESSON_HOME_DIR = CURRENT_DIR
DATA_HOME_DIR = CURRENT_DIR + '/data/redux'
TEST_PATH = DATA_HOME_DIR + '/test/' #We use all the test data
RESULTS_PATH = DATA_HOME_DIR + '/results/'


# Training and validation batches are read from the sample set.
PATH = DATA_HOME_DIR + '/sample/'

# PATH already ends with '/', so the sub-directories are appended without a
# leading slash; this avoids paths such as '.../sample//train/'.
TRAIN_PATH = PATH + 'train/'
VALIDATION_PATH = PATH + 'validation/'

In [3]:
#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
%matplotlib inline

## Helper Functions

In [11]:
def setup_standard_dir_structure(base_dir=None):
    """Create the directory skeleton the rest of the notebook works in.

    The following directories are created below ``base_dir`` if they do not
    exist yet: ``validation``, ``results``, ``test/unknown``, ``sample/train``,
    ``sample/test``, ``sample/validation`` and ``sample/results``.

    The function is idempotent: running the cell again does not fail on
    directories that are already present.

    Args:
        base_dir: Root data directory. Defaults to ``DATA_HOME_DIR``.

    Raises:
        OSError: If ``base_dir`` does not exist.

    Side effects:
        The working directory is changed to ``base_dir`` (kept from the
        original ``%cd``).
    """
    if base_dir is None:
        base_dir = DATA_HOME_DIR
    # Resolve first so the joins below stay correct after the chdir.
    base_dir = os.path.abspath(base_dir)

    os.chdir(base_dir)

    relative_dirs = [
        'validation',
        'results',
        # Class directory of 'unknown' for the test images, to more easily work with batches
        'test/unknown',
        'sample/train',
        'sample/test',
        'sample/validation',
        'sample/results',
    ]
    for relative_dir in relative_dirs:
        target_dir = os.path.join(base_dir, *relative_dir.split('/'))
        # isdir guard rather than exist_ok=True so this also runs on Python 2.
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

def count_images_in_training_set(train_dir=None):
    """Return the number of files located directly in the training directory.

    Every regular file at the top level of the directory is counted, whatever
    its extension; files in sub-directories are not.

    Args:
        train_dir: Directory to inspect. Defaults to ``DATA_HOME_DIR + '/train'``.

    Returns:
        int: Number of top-level files.

    Raises:
        OSError: If ``train_dir`` does not exist.

    Side effects:
        The working directory is changed to ``train_dir`` (kept from the
        original ``%cd``).
    """
    if train_dir is None:
        train_dir = DATA_HOME_DIR + '/train'
    os.chdir(train_dir)
    # The built-in next() works on Python 2 and 3; generator.next() is Python 2 only.
    _, _, files = next(os.walk('.'))
    return len(files)

# TODO: Make directory-agnostic
# TODO: Factor out common stuff
def move_training_images_to_validation_set(percent):
    """Move a random subset of the training JPEGs into the validation directory.

    The subset size is ``percent`` percent of the module-level
    ``total_images_in_training_set``, capped at the number of ``*.jpg`` files
    actually present in ``DATA_HOME_DIR + '/train'`` so that an oversized
    request moves everything instead of raising ``IndexError``.

    Args:
        percent: Share of ``total_images_in_training_set`` to move, in percent.

    Side effects:
        Files are renamed from ``train/`` into ``validation/``. The working
        directory is left at the training directory (kept from the original
        ``%cd``).
    """
    validation_dir = DATA_HOME_DIR + '/validation/'

    os.chdir(DATA_HOME_DIR + '/train')
    all_training_jpegs = glob('*.jpg')

    requested = int(total_images_in_training_set * (percent / 100.0))
    # Clamp to [0, available]; a negative value would otherwise slice from the end.
    total_in_validation_set = max(0, min(requested, len(all_training_jpegs)))

    # The format call is applied to the string, not to print()'s return value,
    # so the line works on both Python 2 and Python 3.
    print('Moving {total} from training set to validation set.'.format(total=str(total_in_validation_set)))

    shuffled_training_jpegs = np.random.permutation(all_training_jpegs)
    for file_name in shuffled_training_jpegs[:total_in_validation_set]:
        os.rename(file_name, validation_dir + file_name)

# TODO: Make directory-agnostic
# TODO: Factor out common stuff
def copy_training_images_to_sample(percent):
    """Copy a random subset of the training JPEGs into the sample training set.

    The source directory is addressed explicitly as
    ``DATA_HOME_DIR + '/train'``, so the result does not depend on which
    directory an earlier cell happened to leave as the working directory.

    The subset size is ``percent`` percent of the module-level
    ``total_images_in_training_set``, capped at the number of ``*.jpg`` files
    actually available.

    Args:
        percent: Share of ``total_images_in_training_set`` to copy, in percent.

    Side effects:
        Files are copied into ``DATA_HOME_DIR + '/sample/train/'``; the
        originals stay in place.
    """
    sample_train_dir = DATA_HOME_DIR + '/sample/train/'
    all_training_jpegs = glob(os.path.join(DATA_HOME_DIR + '/train', '*.jpg'))

    requested = int(total_images_in_training_set * (percent / 100.0))
    # Clamp to [0, available] so an oversized request cannot raise IndexError.
    total_in_sample_training_set = max(0, min(requested, len(all_training_jpegs)))

    # The format call is applied to the string, not to print()'s return value,
    # so the line works on both Python 2 and Python 3.
    print('Copying {total} from training set to sample training set.'.format(total=str(total_in_sample_training_set)))

    shuffled_training_jpegs = np.random.permutation(all_training_jpegs)
    for source_path in shuffled_training_jpegs[:total_in_sample_training_set]:
        copyfile(source_path, sample_train_dir + os.path.basename(source_path))
        
# TODO: Make directory-agnostic
# TODO: Factor out common stuff
def copy_validation_images_to_sample(percent):
    """Copy a random subset of the validation JPEGs into the sample validation set.

    The subset size is ``percent`` percent of the module-level
    ``total_images_in_training_set`` (the training-set total, not the size of
    the validation set), capped at the number of ``*.jpg`` files actually
    present in the validation directory so that an oversized request copies
    everything instead of raising ``IndexError``.

    Args:
        percent: Share of ``total_images_in_training_set`` to copy, in percent.

    Side effects:
        Files are copied into ``DATA_HOME_DIR + '/sample/validation/'``. The
        working directory is left at ``DATA_HOME_DIR + '/validation'`` (kept
        from the original ``%cd``).
    """
    sample_validation_dir = DATA_HOME_DIR + '/sample/validation/'

    os.chdir(DATA_HOME_DIR + '/validation')
    all_validation_jpegs = glob('*.jpg')

    requested = int(total_images_in_training_set * (percent / 100.0))
    # Clamp to [0, available]; a negative value would otherwise slice from the end.
    total_in_sample_validation_set = max(0, min(requested, len(all_validation_jpegs)))

    # The format call is applied to the string, not to print()'s return value,
    # so the line works on both Python 2 and Python 3.
    print('Copying {total} from validation set to sample validation set.'.format(total=str(total_in_sample_validation_set)))

    shuffled_validation_jpegs = np.random.permutation(all_validation_jpegs)
    for file_name in shuffled_validation_jpegs[:total_in_sample_validation_set]:
        copyfile(file_name, sample_validation_dir + file_name)
 
# TODO: Make directory-agnostic
# TODO: Factor out common stuff
def copy_test_images_to_sample(percent):
    """Copy a random subset of the test JPEGs into the sample test set.

    The subset size is ``percent`` percent of the module-level
    ``total_images_in_training_set`` (the training-set total, not the size of
    the test set), capped at the number of ``*.jpg`` files actually present at
    the top level of the test directory so that an oversized request copies
    everything instead of raising ``IndexError``.

    Args:
        percent: Share of ``total_images_in_training_set`` to copy, in percent.

    Side effects:
        Files are copied into ``DATA_HOME_DIR + '/sample/test/'``. The working
        directory is left at ``DATA_HOME_DIR + '/test'`` (kept from the
        original ``%cd``).
    """
    sample_test_dir = DATA_HOME_DIR + '/sample/test/'

    os.chdir(DATA_HOME_DIR + '/test')
    all_test_jpegs = glob('*.jpg')

    requested = int(total_images_in_training_set * (percent / 100.0))
    # Clamp to [0, available]; a negative value would otherwise slice from the end.
    total_in_sample_test_set = max(0, min(requested, len(all_test_jpegs)))

    # The format call is applied to the string, not to print()'s return value,
    # so the line works on both Python 2 and Python 3.
    print('Copying {total} from test set to sample test set.'.format(total=str(total_in_sample_test_set)))

    shuffled_test_jpegs = np.random.permutation(all_test_jpegs)
    for file_name in shuffled_test_jpegs[:total_in_sample_test_set]:
        copyfile(file_name, sample_test_dir + file_name)

## Downloading Kaggle Data

Go ahead and run the following in a terminal, replacing `farlion` with your own Kaggle username:

```
kg config -u farlion -p -c dogs-vs-cats-redux-kernels-edition
```


In [7]:
!kg download

downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/test.zip

test.zip 100% |######################################| Time: 0:04:24   1.0 MiB/s
downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/train.zip

train.zip 100% |#####################################| Time: 0:08:48   1.0 MiB/s
downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/sample_submission.csv

sample_submission.csv already downloaded !


Now we're placing them into the following structure

```
utils/
    vgg16.py
    utils.py
lesson1/
    redux.ipynb
    data/
        redux/
            train/
                cat.437.jpg
                dog.9924.jpg
                cat.1029.jpg
                dog.4374.jpg
            test/
                231.jpg
                325.jpg
                1235.jpg
                9923.jpg
```

In [8]:
!unzip -q test.zip
!unzip -q train.zip

In [9]:
#!rm test.zip train.zip

In [10]:
!mkdir -p data/redux
!mv test train data/redux/

## Action Plan
1. Create Validation and Sample sets
2. Rearrange image files into their respective directories 
3. Finetune and Train model
4. Generate predictions
5. Validate predictions
6. Submit predictions to Kaggle

## Create Validation and Sample sets

In [None]:
setup_standard_dir_structure()

In [None]:
# Count once, before any image is moved, and keep the result: the move/copy
# helpers read the module-level name `total_images_in_training_set` to size
# their subsets, and nothing else in the notebook assigns it.
total_images_in_training_set = count_images_in_training_set()
print('We have {total} images in total in our training set.'.format(total=str(total_images_in_training_set)))

In [None]:
move_training_images_to_validation_set(percent=10)

In [None]:
copy_training_images_to_sample(percent=1)

In [None]:
copy_validation_images_to_sample(percent=0.25)

In [None]:
copy_test_images_to_sample(percent=10)

## Split up image files into class directories

In [None]:
# Sort the cat.*.jpg / dog.*.jpg files of every labelled set into per-class
# sub-directories ('cats' and 'dogs'). Absolute paths are used throughout, so
# this cell neither depends on nor changes the working directory, and it can be
# re-run safely: existing directories are reused and already moved files no
# longer match the glob.
for labelled_dir in ('sample/train', 'sample/validation', 'validation', 'train'):
    for label in ('cat', 'dog'):
        class_dir = os.path.join(DATA_HOME_DIR, labelled_dir, label + 's')
        if not os.path.isdir(class_dir):
            os.makedirs(class_dir)
        for image_path in glob(os.path.join(DATA_HOME_DIR, labelled_dir, label + '.*.jpg')):
            os.rename(image_path, os.path.join(class_dir, os.path.basename(image_path)))

# The test images carry no label; they go into the single class directory
# 'unknown' that the directory setup prepares for them.
# NOTE(review): assumes vgg.test() reads images through class sub-directories
# in the same way the training batches do - confirm against utils/vgg16.py.
unknown_dir = os.path.join(DATA_HOME_DIR, 'test', 'unknown')
if not os.path.isdir(unknown_dir):
    os.makedirs(unknown_dir)
for image_path in glob(os.path.join(DATA_HOME_DIR, 'test', '*.jpg')):
    os.rename(image_path, os.path.join(unknown_dir, os.path.basename(image_path)))

## Finetune and Train Model

In [None]:
%cd $DATA_HOME_DIR

vgg = Vgg16()

In [None]:
BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 0.01

In [None]:
train_batches = vgg.get_batches(TRAIN_PATH, batch_size=BATCH_SIZE)
validation_batches = vgg.get_batches(VALIDATION_PATH, batch_size=BATCH_SIZE)

vgg.finetune(train_batches)

In [None]:
#Not sure if we set this for all fits
# NOTE(review): this rebinds the optimizer's `lr` attribute to a plain Python
# float. Whether later fits pick the new value up presumably depends on whether
# the training function has already been built - TODO confirm for the Keras
# version in use (the backend's set_value helper updates the variable in place).
vgg.model.optimizer.lr = LEARNING_RATE

In [None]:
#For each epoch we test our model against the validation set
# One fit() call per epoch so that the weights can be written out after every
# epoch; latest_weights_filename always names the most recent file and is used
# later to reload those weights.
latest_weights_filename = None
for epoch in range(EPOCHS):
    # .format() is applied to the string, not to print()'s return value, so
    # the line works on both Python 2 and Python 3.
    print('Running epoch {}'.format(epoch))
    vgg.fit(train_batches, validation_batches, nb_epoch=1)
    latest_weights_filename = 'ft{}d.h5'.format(epoch)
    vgg.model.save_weights(RESULTS_PATH + latest_weights_filename)
print('Completed {} fit operations'.format(EPOCHS))

## Generate Predictions

In [None]:
#For every image, vgg.test() generates two probabilities 
#based on how we've ordered the cats/dogs directories.
#It looks like column one is cats and column two is dogs
prediction_batches, predictions = vgg.test(TEST_PATH, batch_size=BATCH_SIZE)

In [None]:
print(predictions[:5])

prediction_filenames = prediction_batches.filenames
print(prediction_filenames[:5])

In [None]:
# You can verify the column ordering by viewing some images
# Uses the names defined earlier in the notebook: TEST_PATH (constants cell)
# and prediction_filenames (taken from prediction_batches.filenames).
Image.open(TEST_PATH + prediction_filenames[2])

In [None]:
#Save our test results arrays so we can use them again later
# Uses the names defined earlier in the notebook: RESULTS_PATH, predictions and
# prediction_filenames.
save_array(RESULTS_PATH + 'predictions.dat', predictions)
save_array(RESULTS_PATH + 'prediction_filenames.dat', prediction_filenames)

## Validate Predictions

Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting. 

- **Tip**: with our first model we should try to overfit before we start worrying about how to reduce over-fitting - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly).

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (ie those with highest probability that are correct)
4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
5. The most uncertain labels (ie those with probability closest to 0.5).

Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)

Calculate predictions on validation set, so we can find correct and incorrect examples:

In [None]:
# Reload the weights written after the last training epoch.
vgg.model.load_weights(RESULTS_PATH + latest_weights_filename)

In [None]:
# Run the model over the sample validation set.
# Note: this rebinds validation_batches, which an earlier cell assigned from
# vgg.get_batches(VALIDATION_PATH, ...) and passed to vgg.fit().
validation_batches, validation_probabilities = vgg.test(VALIDATION_PATH, batch_size=BATCH_SIZE)
validation_filenames = validation_batches.filenames
expected_labels = validation_batches.classes # 0 or 1

#Round our predictions to 0/1 to generate labels
# Column 0 is used as the probability of the first class (presumably 'cats',
# per the note in the prediction section - verify against the class ordering).
our_predictions = validation_probabilities[:,0]
# 1 - p(column 0) rounds to 1 when the column-0 probability is below 0.5, so
# our_labels uses the same 0/1 encoding as expected_labels under that assumption.
our_labels = np.round(1-our_predictions)

## Submit Predictions to Kaggle