# NOAA Right Whales Basic Entry

## Imports, Constants and Settings

In [1]:
import os,sys
from shutil import copyfile
from PIL import Image
from keras.preprocessing import image

utils_path = os.path.abspath(os.path.join('./utils'))
if utils_path not in sys.path:
    sys.path.append(utils_path)
from utils import *

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

ERROR (theano.sandbox.cuda): nvcc compiler not found on $PATH. Check your nvcc installation and try again.


In [None]:
# Root locations: the directory the notebook was started in doubles as the lesson home.
CURRENT_DIR = os.getcwd()
LESSON_HOME_DIR = CURRENT_DIR
DATA_HOME_DIR = '{}/data/redux'.format(CURRENT_DIR)

# Test images and saved results. The trailing slashes matter: file names are
# appended to these paths by plain string concatenation further down.
TEST_PATH = '{}/test/'.format(DATA_HOME_DIR)
RESULTS_PATH = '{}/results/'.format(DATA_HOME_DIR)

# Working set used for training; currently the small sample.
PATH = '{}/sample/'.format(DATA_HOME_DIR)
TRAIN_PATH = '{}train/'.format(PATH)
VALIDATION_PATH = '{}validation/'.format(PATH)

In [None]:
# Enable the inline plotting backend.
# In Jupyter notebooks, you will need to run this command before doing any plotting.
%matplotlib inline

## Helper Functions

In [None]:
def setup_standard_dir_structure():
    %cd $DATA_HOME_DIR
    %mkdir validation
    %mkdir results
    # Moving all test images to a class directory of 'unknown', to more easily work with batches
    %mkdir test/unknown
    %mkdir -p sample/train
    %mkdir -p sample/test
    %mkdir -p sample/validation
    %mkdir -p sample/results

# TODO: Make directory-agnostic
def count_images_in_training_set():
    %cd $DATA_HOME_DIR/train
    path, dirs, files = os.walk('.').next()
    return len(files)

# TODO: Make directory-agnostic
# TODO: Factor out common stuff
def move_training_images_to_validation_set(percent):
    total_in_validation_set = int(total_images_in_training_set * (percent / 100.0))

    print('Moving {total} from training set to validation set.').format(total=str(total_in_validation_set))

    %cd $DATA_HOME_DIR/train
    all_training_jpegs = glob('*.jpg')
    shuffled_training_jpegs = np.random.permutation(all_training_jpegs)
    for i in range(total_in_validation_set):
        os.rename(shuffled_training_jpegs[i], DATA_HOME_DIR + '/validation/' + shuffled_training_jpegs[i])

# TODO: Factor out common stuff
def copy_training_images_to_sample(percent):
    """Copy a random subset of training images into DATA_HOME_DIR/sample/train.

    Args:
        percent: Share to copy, as a percentage of the module-level
            `total_images_in_training_set` (must be set first).

    The training directory is addressed explicitly, so the result does not
    depend on which directory an earlier cell happened to leave as the
    working directory. If fewer .jpg files are present than requested, all
    of them are copied.
    """
    total_in_sample_training_set = int(total_images_in_training_set * (percent / 100.0))

    # .format() must be applied to the string, not to print()'s return value (None on Python 3)
    print('Copying {total} from training set to sample training set.'.format(total=str(total_in_sample_training_set)))

    all_training_jpegs = glob(os.path.join(DATA_HOME_DIR + '/train', '*.jpg'))
    shuffled_training_jpegs = np.random.permutation(all_training_jpegs)
    # Slicing (instead of indexing by range) cannot run past the available images
    for jpeg_path in shuffled_training_jpegs[:max(total_in_sample_training_set, 0)]:
        copyfile(jpeg_path, DATA_HOME_DIR + '/sample/train/' + os.path.basename(jpeg_path))
        
# TODO: Make directory-agnostic
# TODO: Factor out common stuff
def copy_validation_images_to_sample(percent):
    %cd $DATA_HOME_DIR/validation
    total_in_sample_validation_set = int(total_images_in_training_set * (percent / 100.0))

    print('Copying {total} from validation set to sample validation set.').format(total=str(total_in_sample_validation_set))

    all_training_jpegs = glob('*.jpg')
    shuffled_training_jpegs = np.random.permutation(all_training_jpegs)
    for i in range(total_in_sample_validation_set):
         copyfile(shuffled_training_jpegs[i], DATA_HOME_DIR + '/sample/validation/' + shuffled_training_jpegs[i])
           
def split_into_one_directory_per_class(dirs, classes):
    """Sort '<label>.<id>.jpg' files into one subdirectory per class.

    Args:
        dirs: Directories whose .jpg files should be sorted.
        classes: Class directory names, e.g. ['cats', 'dogs'].

    For each class a subdirectory of that exact name is created if missing.
    Files whose prefix equals the class name are moved into it, as are files
    whose prefix is the class name without a trailing 's' (the images are
    named 'cat.437.jpg' while the class directory is 'cats').

    The working directory is not changed, and re-running is harmless.
    """
    for directory in dirs:
        for category_class in classes:
            target_dir = os.path.join(directory, category_class)
            # Explicit isdir check instead of exist_ok so this also works on Python 2
            if not os.path.isdir(target_dir):
                os.makedirs(target_dir)

            prefixes = [category_class]
            if category_class.endswith('s'):
                prefixes.append(category_class[:-1])

            for prefix in prefixes:
                for jpeg_path in glob(os.path.join(directory, prefix + '.*.jpg')):
                    os.rename(jpeg_path, os.path.join(target_dir, os.path.basename(jpeg_path)))
            
def move_test_images_to_unknown_category_for_easier_batching():
    """Move every .jpg in TEST_PATH into its 'unknown/' subdirectory.

    Side effect: leaves the working directory at TEST_PATH.
    """
    # Switch into the test directory so the glob below is relative to it
    %cd $TEST_PATH
    # NOTE(review): assumes unknown/ already exists (setup_standard_dir_structure creates it)
    %mv *.jpg unknown/

## Downloading Kaggle Data

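Before downloading, configure the Kaggle CLI by running the following in a terminal (substitute your own Kaggle username after `-u` and supply your password after `-p`):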

```
kg config -u farlion -p -c dogs-vs-cats-redux-kernels-edition
```


In [None]:
# Download the competition files with the Kaggle CLI; run the `kg config` step above first.
!kg download

Next, we place the downloaded files into the following structure:

```
utils/
    vgg16.py
    utils.py
lesson1/
    redux.ipynb
    data/
        redux/
            train/
                cat.437.jpg
                dog.9924.jpg
                cat.1029.jpg
                dog.4374.jpg
            test/
                231.jpg
                325.jpg
                1235.jpg
                9923.jpg
```

In [None]:
# Extract both archives quietly (-q) into the current directory.
# The move step below expects this to produce test/ and train/ folders.
!unzip -q test.zip
!unzip -q train.zip

In [None]:
#!rm test.zip train.zip

In [None]:
# Create the data root (including parents) and move the extracted image folders into it
!mkdir -p data/redux
!mv test train data/redux/

## Action Plan
1. Create Validation and Sample sets
2. Rearrange image files into their respective directories 
3. Finetune and Train model
4. Generate predictions
5. Validate predictions
6. Submit predictions to Kaggle

## Create Validation and Sample sets

In [None]:
# Create the validation/, results/, test/unknown/ and sample/* directories under DATA_HOME_DIR
setup_standard_dir_structure()

In [None]:
# Put all test images under test/unknown/ so they can be read as a single-class batch
move_test_images_to_unknown_category_for_easier_batching()

In [None]:
# The helper functions below read this module-level total, so it must be set
# before any images are moved or copied.
total_images_in_training_set = count_images_in_training_set()
# .format() must be applied to the string, not to print()'s return value (None on Python 3)
print('We have {total} images in total in our training set.'.format(total=str(total_images_in_training_set)))

In [None]:
# Hold out 10% of the training images (chosen at random) as the validation set
move_training_images_to_validation_set(percent=10)

In [None]:
# Copy a random 1% (measured against the original training total) into sample/train
copy_training_images_to_sample(percent=1)

In [None]:
# Copy validation images into sample/validation; the count is 0.25% of the training total
copy_validation_images_to_sample(percent=0.25)

## Split up image files into class directories

In [None]:
# Every image directory that has to be sorted into per-class subfolders
dirs = [
    DATA_HOME_DIR + '/sample/train',
    DATA_HOME_DIR + '/sample/validation',
    DATA_HOME_DIR + '/validation',
    DATA_HOME_DIR + '/train',
]
classes = ['cats', 'dogs']
split_into_one_directory_per_class(dirs=dirs, classes=classes)

## Finetune and Train Model

In [None]:
# Work from the data root for the rest of the notebook
%cd $DATA_HOME_DIR

# Vgg16 comes from the course's utils package (see utils/vgg16.py in the layout above)
vgg = Vgg16()

In [None]:
# Training hyperparameters: images per batch, number of fit passes, optimizer learning rate
BATCH_SIZE, EPOCHS, LEARNING_RATE = 64, 3, 0.01

In [None]:
# Batch iterators over the training and validation directories (one subfolder per class)
train_batches = vgg.get_batches(TRAIN_PATH, batch_size=BATCH_SIZE)
validation_batches = vgg.get_batches(VALIDATION_PATH, batch_size=BATCH_SIZE)

# NOTE(review): presumably adapts the model's output layer to the classes found in
# train_batches — confirm against utils/vgg16.py
vgg.finetune(train_batches)

In [None]:
#Not sure if we set this for all fits
# NOTE(review): this rebinds the optimizer's `lr` attribute to a plain Python float.
# Whether that takes effect may depend on the Keras version/backend and on whether
# the training function has already been built — confirm.
vgg.model.optimizer.lr = LEARNING_RATE

In [None]:
#For each epoch we test our model against the validation set
#and save the weights, so the latest checkpoint can be reloaded later.
latest_weights_filename = None
for epoch in range(EPOCHS):
    # .format() must be applied to the string, not to print()'s return value (None on Python 3)
    print('Running epoch {}'.format(epoch))
    vgg.fit(train_batches, validation_batches, nb_epoch=1)
    latest_weights_filename = 'ft{}.h5'.format(epoch)
    vgg.model.save_weights(RESULTS_PATH + latest_weights_filename)
print('Completed {} fit operations'.format(EPOCHS))

## Generate Predictions

In [None]:
#For every image, vgg.test() generates two probabilities
#based on how we've ordered the cats/dogs directories.
#It looks like column one is cats and column two is dogs
#(the image-viewing cell further down can be used to confirm this).
prediction_batches, predictions = vgg.test(TEST_PATH, batch_size=BATCH_SIZE)

In [None]:
# Peek at the first few probability rows
print(predictions[:5])

# File names reported by the batch iterator, printed alongside for comparison
prediction_filenames = prediction_batches.filenames
print(prediction_filenames[:5])

In [None]:
# You can verify the column ordering by viewing some images.
# Uses the names defined above (TEST_PATH, prediction_filenames); the previous
# `test_path` / `filenames` were never defined at this point in the notebook.
Image.open(TEST_PATH + prediction_filenames[2])

In [None]:
#Save our test results arrays so we can use them again later.
#Uses the names defined above; `results_path`, `preds` and `filenames`
#were never defined at this point in the notebook.
save_array(RESULTS_PATH + 'predictions.dat', predictions)
save_array(RESULTS_PATH + 'prediction_filenames.dat', prediction_filenames)

## Validate Predictions

Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting. 

- **Tip**: with our first model we should try to overfit before we start worrying about how to reduce over-fitting - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly).

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (ie those with highest probability that are correct)
4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
5. The most uncertain labels (ie those with probability closest to 0.5).

Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)

Calculate predictions on validation set, so we can find correct and incorrect examples:

In [None]:
# Reload the weights saved after the last training epoch
vgg.model.load_weights(RESULTS_PATH+latest_weights_filename)

In [None]:
# Note: this rebinds validation_batches to the iterator returned by vgg.test()
validation_batches, validation_probabilities = vgg.test(VALIDATION_PATH, batch_size=BATCH_SIZE)
validation_filenames = validation_batches.filenames
expected_labels = validation_batches.classes # 0 or 1

#Round our predictions to 0/1 to generate labels
# our_predictions is the first probability column (assumed to be "cat", see the test
# predictions above), so 1 - our_predictions rounds to 1 when the second class is more likely.
our_predictions = validation_probabilities[:,0]
our_labels = np.round(1-our_predictions)

In [None]:
#Helper function to plot images by index in the validation set
#plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    """Plot the validation images found at the given indices.

    Args:
        idx: Iterable of integer positions into `validation_filenames`.
        titles: Optional sequence of titles, passed through to `plots`.
    """
    # Must use the validation path and file list; `valid_path` was undefined and
    # `filenames` is not the validation file list.
    plots([image.load_img(VALIDATION_PATH + validation_filenames[i]) for i in idx], titles=titles)

#Number of images to view for each visualization task
n_view = 4

In [None]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
# print() with a single pre-formatted string behaves the same on Python 2 and 3
print("Found %d correct labels" % len(correct))
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

In [None]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
# print() with a single pre-formatted string behaves the same on Python 2 and 3
print("Found %d incorrect labels" % len(incorrect))
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

In [None]:
#3a. The images we were most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
# print() with a single pre-formatted string behaves the same on Python 2 and 3
print("Found %d confident correct cats labels" % len(correct_cats))
# Descending order: the highest first-column probabilities come first
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

In [None]:
#3b. The images we were most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
# print() with a single pre-formatted string behaves the same on Python 2 and 3
print("Found %d confident correct dogs labels" % len(correct_dogs))
# Ascending order: the lowest first-column probabilities come first
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])

In [None]:
#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
# print() with a single pre-formatted string behaves the same on Python 2 and 3
print("Found %d incorrect cats" % len(incorrect_cats))
# Skip plotting when there are no such mistakes
if len(incorrect_cats):
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

In [None]:
#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
# print() with a single pre-formatted string behaves the same on Python 2 and 3
print("Found %d incorrect dogs" % len(incorrect_dogs))
# Skip plotting when there are no such mistakes
if len(incorrect_dogs):
    most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])

In [None]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
# Slice once so the plotted images and their titles cover exactly the same indices
most_uncertain_to_view = most_uncertain[:n_view]
plots_idx(most_uncertain_to_view, our_predictions[most_uncertain_to_view])

Perhaps the most common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

In [None]:
from sklearn.metrics import confusion_matrix
# Arguments are (true labels, predicted labels)
cm = confusion_matrix(expected_labels, our_labels)

We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependent variables with a larger number of categories).

In [None]:
# `val_batches` was never defined; the validation iterator is named validation_batches
plot_confusion_matrix(cm, validation_batches.class_indices)

## Submit Predictions to Kaggle

Here's the format Kaggle requires for new submissions:
```
id,label
1242, .3984
3947, .1000
4539, .9082
2345, .0000
```

Kaggle wants the imageId followed by the probability of the image being a dog. Kaggle uses a metric called [Log Loss](http://wiki.fast.ai/index.php/Log_Loss) to evaluate your submission.

In [None]:
#Load our test predictions from file
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')

In [None]:
#Grab the dog prediction column
isdog = preds[:,1]
print "Raw Predictions: " + str(isdog[:5])
print "Mid Predictions: " + str(isdog[(isdog < .6) & (isdog > .4)])
print "Edge Predictions: " + str(isdog[(isdog == 1) | (isdog == 0)])

In [None]:
#So to play it safe, we use a sneaky trick to pull in our edge predictions:
#clamp every value into [0.05, 0.95]. Anything below .05 becomes .05 and anything
#above .95 becomes .95 (this affects all values outside the range, not only exact 0s and 1s).
isdog = isdog.clip(min=0.05, max=0.95)

In [None]:
#Extract imageIds from the filenames in our test/unknown directory 
filenames = batches.filenames
ids = np.array([int(f[8:f.find('.')]) for f in filenames])

Here we join the two columns into an array of [imageId, isDog]

In [None]:
# Pair each image id with its dog probability: one row per image, two columns
subm = np.column_stack((ids, isdog))
subm[:5]

In [None]:
# Write the submission CSV into the data root
%cd $DATA_HOME_DIR
submission_file_name = 'submission1.csv'
# fmt: integer id, probability with 5 decimals; comments='' keeps the header line free of a '#' prefix
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
from IPython.display import FileLink
# Return to the lesson directory so the relative link below resolves
%cd $LESSON_HOME_DIR
# Render a link to the CSV written in the previous cell
FileLink('data/redux/'+submission_file_name)