In [5]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import caffe
import itertools
import pandas
import cv2
import scipy

from os.path import join as pjoin
from math import ceil
from copy import copy
from collections import OrderedDict
from operator import itemgetter, attrgetter
from matplotlib import pyplot
from scipy import ndimage as ndimage

from caffe import layers as L
from caffe import params as P
import tools

from ntb.layer.data import Transformer
from ntb.db import *

%matplotlib inline
# Default (width, height) for every figure drawn in this notebook.
pyplot.rcParams['figure.figsize'] = (10, 6)

# Seed both Python's and NumPy's global RNGs so that any randomized step
# gives the same result on every run.
random.seed(0)
np.random.seed(0)

In [46]:
# Notebook-wide dataset handle. Network.get_scores reads db.metadata[index]
# to find the image file for each index, so this cell must run first.
# NOTE(review): NTBDB is presumably provided by `from ntb.db import *` -- confirm.
db = NTBDB()

In [74]:
class Network(object):
    """Thin wrapper around a caffe net that scores images in batches.

    Image files are located through the notebook-level names ``db`` and
    ``image_path``, so those must be defined before ``get_scores`` is called.
    """

    def __init__(self, definition, weights, labels, transformer):
        """Load the net in ``caffe.TEST`` mode.

        definition  -- network definition handed to ``caffe.Net``
        weights     -- weights handed to ``caffe.Net``
        labels      -- names of the score columns produced by ``get_scores``
        transformer -- object whose ``preprocess(image)`` result is written
                       into the net's 'data' blob
        """
        self.net = caffe.Net(definition, weights, caffe.TEST)
        self.transformer = transformer
        self.labels = labels

    def get_scores(self, indexes, batch_size=256):
        """Run the net over ``indexes`` and return one score record per index.

        indexes    -- numpy array of keys into ``db.metadata``
        batch_size -- upper bound on the number of images per forward pass

        Returns a 1-D structured array with an 'idx' field ('S27', so longer
        keys are truncated) followed by one 'f4' field per entry of
        ``self.labels``. Assumes each row of the 'score' blob holds one value
        per label, in ``self.labels`` order -- TODO confirm against the net
        definition.
        """
        # Use the labels this network was built with rather than whatever
        # global ``labels`` happens to exist in the notebook.
        dtype = [('idx', 'S27')] + [(label, 'f4') for label in self.labels]

        # np.array_split rejects 0 sections, so handle "nothing to score" here.
        if indexes.size == 0:
            return np.array([], dtype=dtype)

        data_blob = self.net.blobs['data']
        # Work on a private list so the blob's own shape object is not mutated.
        shape = list(data_blob.shape)
        shape[0] = batch_size
        data_blob.reshape(*shape)

        def predict_batch(batch):
            # Fill the first len(batch) slots of the data blob, then run the net.
            # Slots beyond len(batch) keep stale contents; their scores are
            # never read by get_batch_scores.
            for i, index in enumerate(batch):
                image = caffe.io.load_image(image_path(db.metadata[index]))
                data_blob.data[i] = self.transformer.preprocess(image)
            self.net.forward()

        def get_batch_scores(batch):
            # One (index, score_0, score_1, ...) tuple per image in the batch.
            for i, index in enumerate(batch):
                yield (index,) + tuple(self.net.blobs['score'].data[i])

        batch_num = int(ceil(indexes.size / float(batch_size)))
        scores = []
        print("Going to process {} batches".format(batch_num))
        # array_split yields near-equal chunks, each at most batch_size long.
        for i, batch in enumerate(np.array_split(indexes, batch_num)):
            predict_batch(batch)
            scores.extend(get_batch_scores(batch))
            if i % 50 == 0:
                print("Batch {} finised".format(i))

        return np.array(scores, dtype=dtype)

In [82]:
def get_curves(labels, scores, ground_truth):
    """Compute a precision/recall curve for every label.

    labels       -- iterable of label names; each must be a field of ``scores``
                    and a key of ``ground_truth``
    scores       -- 1-D structured array with an 'idx' field and one score
                    field per label
    ground_truth -- mapping label -> collection of the idx values that truly
                    carry that label

    Returns a dict label -> structured array with 'precision' and 'recall'
    fields ('f4'). Point k describes the k+1 highest-scored images.

    Raises ZeroDivisionError if a label has an empty ground-truth collection
    while ``scores`` is non-empty.
    """
    def get_label_curve(label):
        # np.sort is ascending on the label's score; reverse for best-first.
        # Reading the single field 'idx' directly avoids viewing a multi-field
        # selection through a hard-coded string width.
        ranked_ids = np.sort(scores, order=[label], axis=0)['idx'][::-1]

        relevant = ground_truth[label]
        # Hoisted into a set so each membership test is O(1) instead of a scan.
        relevant_ids = set(relevant)
        relevant_total = len(relevant)

        points = []
        true_positive = 0
        for predicted_true, index in enumerate(ranked_ids, 1):
            if index in relevant_ids:
                true_positive += 1
            precision = float(true_positive) / predicted_true
            recall = float(true_positive) / relevant_total
            points.append((precision, recall))
        return np.array(points, dtype=[('precision', 'f4'), ('recall', 'f4')])

    return {label: get_label_curve(label) for label in labels}

In [88]:
def get_average_precision(curves):
    """Compute the 11-point interpolated average precision of each curve.

    curves -- mapping label -> structured array with 'precision' and 'recall'
              fields (the shape of data returned by get_curves)

    For each recall threshold 0.0, 0.1, ..., 1.0 the best precision among the
    curve points whose recall is at least that threshold is taken; a threshold
    the curve never reaches contributes 0. The result for a label is the mean
    of those 11 values.

    Returns a dict label -> float.
    """
    recall_range = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    res = {}
    for label, label_curve in curves.items():
        total = 0.0
        for recall in recall_range:
            reachable = label_curve['precision'][label_curve['recall'] >= recall]
            # .max() raises on an empty selection; an unreachable recall level
            # simply adds nothing to the sum.
            if reachable.size:
                total += float(reachable.max())
        res[label] = total / len(recall_range)
    return res

In [77]:
NETS_DIR = "/storage/ntb/nets"
def get_scores_for_net(net_name, transformer, snapshot_num, batch_size=256):
    """Return the test-set scores of one snapshot of a net, caching them on disk.

    net_name     -- directory name of the net under NETS_DIR
    transformer  -- preprocessing object handed to Network
    snapshot_num -- iteration number selecting 'snapshot_iter_<n>.caffemodel'
    batch_size   -- forwarded to Network.get_scores

    If '<net>/data/scores_<snapshot_num>.np' can be opened it is loaded and
    returned. Otherwise the net is run over every index listed in
    '<net>/data/test.pickle' and the result is written to that file.
    """
    # Local import: pickle is not among the notebook's explicit imports.
    import pickle

    # These paths must exist before the cache lookup below; computing them
    # afterwards made every call fail with UnboundLocalError on data_dir.
    net_dir = pjoin(NETS_DIR, net_name)
    data_dir = pjoin(net_dir, 'data')
    scores_file_path = pjoin(data_dir, 'scores_{}.np'.format(snapshot_num))

    try:
        with open(scores_file_path, 'rb') as scores_file:
            return np.load(scores_file)
    except IOError:
        # No readable cache file: fall through and compute the scores.
        pass

    labels = np.load(pjoin(data_dir, 'labels.np'))
    net = Network(
        pjoin(net_dir, 'testnet.prototxt'),
        pjoin(net_dir, 'snapshots', 'snapshot_iter_{}.caffemodel'.format(snapshot_num)),
        labels,
        transformer,
    )
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # this at test.pickle files you created yourself.
    with open(pjoin(data_dir, 'test.pickle'), 'rb') as test_file:
        test_data = pickle.load(test_file)
    assert set(test_data.keys()) == set(labels)
    # Each image is scored once even if it is listed under several labels.
    indexes = np.unique(np.concatenate(list(test_data.values())))
    scores = net.get_scores(indexes, batch_size=batch_size)

    # ndarray.dump writes a pickle, so the file is opened in binary mode.
    with open(scores_file_path, mode='wb') as scores_file:
        scores.dump(scores_file)
    return scores

### Sigmoid loss function, no augmentation, raw dataset

In [None]:
# Scores for snapshot 853 of the "ft_sigmoid_noaug" net, with images
# preprocessed by a Transformer built with shape=[227,227].
scores = get_scores_for_net("ft_sigmoid_noaug", Transformer(shape=[227,227]), 853)

In [76]:
# Paths for the net under evaluation. The cells below read net_dir, data_dir
# and labels as notebook globals.
# NETS_DIR is assigned in the cell that defines get_scores_for_net, so that
# cell has to be run before this one.
net_name = "ft_sigmoid_noaug"
net_dir = pjoin(NETS_DIR, net_name)
data_dir = pjoin(net_dir, 'data')
# Label names for this net: they name the score columns and must equal the
# keys of test_data (asserted where test_data is loaded).
labels = np.load(pjoin(data_dir, 'labels.np'))

In [75]:
# Load snapshot 853 of the net; Network.__init__ opens it in caffe.TEST mode.
net = Network(
    pjoin(net_dir, 'testnet.prototxt'),
    pjoin(net_dir, 'snapshots', 'snapshot_iter_853.caffemodel'),
    labels,
    Transformer(shape=[227,227]),
)

In [43]:
# test_data is keyed by label; its values are later concatenated into the
# list of image indexes to score and passed to get_curves as ground truth.
# NOTE(review): `pickle` is not in the imports cell -- presumably it arrives
# through `from ntb.db import *`; confirm, or import it explicitly.
# NOTE(review): pickle.load runs arbitrary code from the file; only load
# trusted data.
with open(pjoin(data_dir, 'test.pickle')) as test_file:
    test_data = pickle.load(test_file)
# The pickle and labels.np must describe exactly the same label set.
assert set(test_data.keys()) == set(labels)

In [44]:
# Every distinct image index listed under any label (np.unique also sorts them).
indexes = np.unique(np.concatenate(test_data.values()))

In [78]:
# Score every test image with the default batch size. This is the expensive
# step: the recorded run below took about 55 minutes of wall time.
%time scores = net.get_scores(indexes)

Going to process 134 batches
Batch 0 finised
Batch 50 finised
Batch 100 finised
CPU times: user 1h 6min 49s, sys: 4min 19s, total: 1h 11min 8s
Wall time: 54min 35s


In [79]:
# Persist the scores under the same file name that get_scores_for_net looks
# up for snapshot 853, so the expensive scoring pass need not be repeated.
scores_file_path = pjoin(data_dir, 'scores_{}.np'.format(853))
# ndarray.dump writes the array as a pickle.
with open(scores_file_path, mode='w') as scores_file:
    scores.dump(scores_file)

In [84]:
# Per-label precision/recall curves, using test_data as the ground truth.
curves = get_curves(labels, scores, test_data)

In [85]:
# Sanity check: number of score records (one per scored image index).
scores.size

34245