### Using FaceNet with distance metrics

Finetuning works well enough for a closed set problem, but won't for open set which is my eventual use case. From what I've seen the standard protocol for benchmarking is to perform classification or verification from the embeddings directly. The benchmarks I've seen are on a verification task. So evaluation involves labeling two photos as the same identity or not. In this case, a simple threshold for a similarity score (euclidean distance or cosine similarity depending on the loss in the original training).

So this notebook will look at how to use Euclidean distance with FaceNet for classification starting with the same closed set I used for finetuning.


In [1]:
from facenet_pytorch import MTCNN, InceptionResnetV1, fixed_image_standardization, training, extract_face
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler, SequentialSampler
from torch import optim
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
import numpy as np
import os
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
import torch.nn.functional as F
from copy import deepcopy
import shutil
from sklearn.metrics import accuracy_score

In [2]:
data_dir = 'data/zack_erin'

batch_size = 16
epochs = 15
workers = 0 if os.name == 'nt' else 8

In [3]:
#Needed to rotate files based on exif. Default loader strips exif, so can't do in transform step. PIL files don't keep exif when made from MPO file

def exif_rotate_pil_loader(path):
    with open(path, 'rb') as f:
        image = Image.open(f)
        image = reorient_image(image)
        image = image.convert('RGB')  #replicates pil_loader from torchvision. Copies to convert to PIL
    return image


def reorient_image(im):
    try:
        image_exif = im._getexif()
        image_orientation = image_exif[274]
        if image_orientation in (2, '2'):
            return im.transpose(Image.FLIP_LEFT_RIGHT)
        elif image_orientation in (3, '3'):
            return im.transpose(Image.ROTATE_180)
        elif image_orientation in (4, '4'):
            return im.transpose(Image.FLIP_TOP_BOTTOM)
        elif image_orientation in (5, '5'):
            return im.transpose(Image.ROTATE_90).transpose(Image.FLIP_TOP_BOTTOM)
        elif image_orientation in (6, '6'):
            return im.transpose(Image.ROTATE_270)
        elif image_orientation in (7, '7'):
            return im.transpose(Image.ROTATE_270).transpose(Image.FLIP_TOP_BOTTOM)
        elif image_orientation in (8, '8'):
            return im.transpose(Image.ROTATE_90)
        else:
            return im
    except (KeyError, AttributeError, TypeError, IndexError):
        return im


def matplotlib_imshow(img, one_channel=False):
    if one_channel:
        img = img.mean(dim=0)
    img = img / 2 + 0.5  #unnormalize
    npimg = img.cpu().numpy()
    if one_channel:
        plt.imshow(npimg, cmap="Greys")
    else:
        plt.imshow(np.transpose(npimg, (1, 2, 0)))


def plot_classes_preds(preds, images, labels, classes, fig, save_path=None):
    '''
    Generates matplotlib Figure using a trained network, along with images
    and labels from a batch, that shows the network's top prediction along
    with its probability, alongside the actual label, coloring this
    information based on whether the prediction was correct or not.
    Uses the "images_to_probs" function.
    '''
    #plot the images in the batch, along with predicted and true labels
    #if not fig:
    fig = plt.figure(figsize=(20, 7))
    for idx in np.arange(len(preds)):
        ax = fig.add_subplot(2, np.ceil(len(preds)/2), idx+1, xticks=[], yticks=[])
        matplotlib_imshow(images[idx], one_channel=False)
        ax.set_title("{0}, {1:.1f}%\n(label: {2})".format(
            classes[preds[idx]],
            classes[labels[idx]]),
            color=("green" if preds[idx] == labels[idx].item() else "red"))
    if save_path:
        plt.savefig(save_path)
    else:
        return fig


In [4]:
class MTCNN_w_batch_extract(MTCNN):
    """custom class that includes addtional methods to allow for easier separation of detection,
        selection and extraction.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def select_boxes(self, all_boxes, all_probs, all_points, method='probability', threshold=.9):
        """Selects a single box from multiple for a given image using one of multiple heuristics.
        Arguments:
                batch_boxes {np.ndarray} -- Nx4 ndarray of bounding boxes for N detected faces (output from self.detect)
                batch_probs {list} -- Length N list of probalities for N detected faces. (output from self.detect)
        Keyword Arguments:
                method {str} -- Which heuristic to use for selection:
                    "probability": highest probability selected
                    "largest": largest box selected
                    "largest_over_theshold": largest box over a certain probability threshold selected
                threshold {float} -- theshold for "largest_over_threshold" method

        Returns:
                tuple(numpy.ndarray, numpy.ndarray) -- Ix4 ndarray of bounding boxes for I images. Ix0 array of probabilities for each box
        """
        selected_boxes, selected_probs, selected_points = [], [], []
        for boxes, points, probs in zip(all_boxes, all_points, all_probs):
            boxes = np.array(boxes)
            probs = np.array(probs)
            points = np.array(points)
            if len(boxes) == 0:
                selected_boxes.append(None)
                selected_probs.append([None])
                selected_points.append(None)
                continue
            elif method == 'largest':
                box_order = np.argsort((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]))[::-1]
            elif method == 'probability':
                box_order = np.argsort(probs)[::-1]
            elif method == 'largest_over_threshold':
                box_mask = probs > threshold
                boxes = boxes[box_mask]
                box_order = np.argsort((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]))[::-1]
                if sum(box_mask) == 0:
                    selected_boxes.append(None)
                    selected_probs.append([None])
                    selected_points.append(None)
                    continue

            box = boxes[box_order][[0]]
            prob = probs[box_order][[0]]
            point = points[box_order][[0]]
            selected_boxes.append(box)
            selected_probs.append(prob)
            selected_points.append(point)

        selected_boxes = np.array(selected_boxes)
        selected_probs = np.array(selected_probs)
        selected_points = np.array(selected_points)

        return selected_boxes, selected_probs, selected_points

    def extract(self, img, batch_boxes, save_path):
        #Determine if a batch or single image was passed
        batch_mode = True
        if not isinstance(img, (list, tuple)) and not (isinstance(img, np.ndarray) and len(img.shape) == 4):
            img = [img]
            batch_boxes = [batch_boxes]
            batch_mode = False

        #Parse save path(s)
        if save_path is not None:
            if isinstance(save_path, str):
                save_path = [save_path]
        else:
            save_path = [None for _ in range(len(img))]

        #Process all bounding boxes
        faces = []
        for im, box_im, path_im in zip(img, batch_boxes, save_path):
            if box_im is None:
                faces.append(None)
                continue

            if not self.keep_all:
                box_im = box_im[[0]]

            faces_im = []
            for i, box in enumerate(box_im):
                face_path = path_im
                if path_im is not None and i > 0:
                    save_name, ext = os.path.splitext(path_im)
                    face_path = save_name + '_' + str(i + 1) + ext

                face = extract_face(im, box, self.image_size, self.margin, face_path)
                if self.post_process:
                    face = fixed_image_standardization(face)
                faces_im.append(face)

            if self.keep_all:
                faces_im = torch.stack(faces_im)
            else:
                faces_im = faces_im[0]

            faces.append(faces_im)

        if not batch_mode:
            faces = faces[0]

        return faces


#### Determine if an nvidia GPU is available

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:0


In [6]:
mtcnn = MTCNN_w_batch_extract(
    image_size=160,
    margin=30,
    min_face_size=20,
    thresholds=[0.6, 0.7, 0.7],
    factor=0.709,
    post_process=True,
    select_largest=False,
    device=device
)

In [7]:
#Define the data loader for the input set of images
orig_img_ds = datasets.ImageFolder(data_dir, loader=exif_rotate_pil_loader, transform=transforms.Resize((1024, 1024)))

In [8]:

#overwrites class labels in dataset with path so path can be used for saving output in mtcnn batches
orig_img_ds.samples = [
    (p, p)
    for p, _ in orig_img_ds.samples
]

loader = DataLoader(
    orig_img_ds,
    num_workers=workers,
    batch_size=batch_size,
    collate_fn=training.collate_pil
)


In [9]:
boxes = []
box_probs = []
paths = []
for i, (x, b_paths) in enumerate(loader):
    crops = [p.replace(data_dir, data_dir + '_cropped') for p in b_paths]
    #for now, doing two forward passes. One for detection and one for extraction. #TODO: make custom MTCNN class that has option for both
    b_boxes, b_box_probs, points = mtcnn.detect(x, landmarks=True)
    b_boxes, b_box_probs, points = mtcnn.select_boxes(b_boxes, b_box_probs, points, method='largest_over_threshold')
    mtcnn.extract(x, b_boxes, save_path=crops)

    boxes.extend(b_boxes)
    box_probs.extend(b_box_probs)
    paths.extend(b_paths)

    print('\rBatch {} of {}'.format(i + 1, len(loader)), end='')


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1595629403081/work/torch/csrc/utils/python_arg_parser.cpp:766.)
  mask_inds = mask.nonzero()
  batch_boxes, batch_points = np.array(batch_boxes), np.array(batch_points)
  boxes = np.array(boxes)
  probs = np.array(probs)
  points = np.array(points)


Batch 12 of 12

In [10]:
#Remove mtcnn to reduce GPU memory usage
del mtcnn
torch.cuda.empty_cache()

In [11]:
#create dataset and data loaders from cropped images output from MTCNN

trans = transforms.Compose([
    np.float32,
    transforms.ToTensor(),
    fixed_image_standardization
])

#Training set can be much smaller because we aren't actually training, just creating a "template" in the gallery
dataset = datasets.ImageFolder(data_dir + '_cropped', transform=trans)
img_inds = np.arange(len(dataset))
np.random.shuffle(img_inds)
train_inds = img_inds[:int(0.3 * len(img_inds))]
val_inds = img_inds[int(0.3 * len(img_inds)):]

classes = dataset.classes

#no need to randomize. there will be only one epoch. Basically don't need the dataloader except for batch control
embed_loader = DataLoader(
    dataset,
    num_workers=workers,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset)
)

In [12]:
#Load pretrained resnet model
resnet = InceptionResnetV1(
    classify=False,
    pretrained='vggface2'
).to(device)

In [13]:
classes = []
embeddings = []
with torch.no_grad():
    for xb, yb in embed_loader:
        xb = xb.to(device)
        b_embeddings = resnet(xb)
        b_embeddings = b_embeddings.to('cpu').numpy()
        classes.extend(yb.numpy())
        embeddings.extend(b_embeddings)


In [35]:
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.model_selection import StratifiedKFold


train_embeddings, val_embeddings, train_classes, val_classes = map(np.array, train_test_split(embeddings, classes))

#think this is basically the same as radius neighbors, but with majority requirement, but I'll see how similar they are
def threshold_nearest_neighbor(val_embedding, train_embeddings, train_classes, threshold, majority_req):
    distances = np.linalg.norm(train_embeddings - val_embedding, axis=1)
    neighbors = np.array(train_classes)[distances<threshold]
    if len(neighbors) == 0:
        return 2
    votes = np.unique(neighbors, return_counts=True)
    if majority_req and max(votes[1]) < int(len(neighbors/2)):
        return 2
    highest_votes = votes[0][np.argmax(votes[1])]
    return highest_votes

val_preds = []
for val_embedding in val_embeddings:
    val_preds.append(threshold_nearest_neighbor(val_embedding, train_embeddings, train_classes, 1.15, False))

accuracy = np.sum(np.equal(val_preds, val_classes))/len(val_classes)
print(accuracy)
accuracy_score(val_classes,val_preds)

0.851063829787234


0.851063829787234

In [33]:
cv = StratifiedKFold(5, shuffle=True)
neighbors = RadiusNeighborsClassifier(radius=1.15, outlier_label=2)
scores = cross_val_score(neighbors, embeddings, classes, cv=cv)
preds = cross_val_predict(neighbors, embeddings, classes, cv=cv)
print(scores)
print(preds)

[0.73684211 0.71052632 0.7027027  0.7027027  0.72972973]
[0 0 2 0 0 0 0 2 0 0 0 0 2 0 1 0 1 0 0 0 0 0 0 0 0 1 0 2 1 1 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 2 0 1 0 0 0 0 1 1 0 0 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2
 1 2 0 2 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0
 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1
 0 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 2 1 1 1 0 0 1 1 0 1 0 2 1 2 1 1 1 1 2 1 0
 1 0]


  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))
  ''.format(self.outlier_label_[k]))


In [None]:
plot_classes_preds(val_preds,val_classes)

### Notes

Accuracy is pretty terrible compared to adding a FC softmax layer and finetuning. I'm going to try out on LFW to see if this is dataset related.
