In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from typing import Iterable
from tqdm.auto import tqdm
import pickle
from scipy.spatial import distance
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve

In [3]:
FIW_FEATURES = Path("/Users/zkhan/Dropbox/rfiw2020-data/FIDs-features/")
validation_csv = pd.read_csv("/Users/zkhan/Dropbox/rfiw2020-data/trisubject_verification.v2/val/val_triples_competition_with_label.csv")
test_csv = pd.read_csv("/Users/zkhan/Dropbox/rfiw2020-data/trisubject_verification.v2/test/test_triples_reference.csv")

In [4]:
def read_features_from_iterable_of_pictures(iterable: Iterable[str], feature_dir: Path, feature_len: int = 512):
    """
    For each picture in the iterable, read the corresponding feature
    file from a directory of feature files.
    
    Parameters
    ------------
    iterable:
        An iterable of face image names.
    feature_dir:
        A Path to a directory containing features of faces, organized in
        the same way as FIW.
    feature_len:
        The size of the feature vector.
        
    Returns
    ------------
    A mxn matrix, where m is the number of images in the iterable, and n is
    the feature len.
    """
    dims = (len(iterable), feature_len)
    features = np.zeros(dims)
    for idx, img in enumerate(tqdm(iterable)):
        feature_file_name = (FIW_FEATURES / img).with_suffix(".pkl")
        with open(feature_file_name, "rb") as f:
            feature_vector = pickle.load(f)
        features[idx] = feature_vector
    return features

# Finding the best thresholds
We will use the mean of the cosine sim between (father, child) and (mother, child), then threshold it.

In [6]:
val_father_features = read_features_from_iterable_of_pictures(validation_csv.F.values, FIW_FEATURES)
val_mother_features = read_features_from_iterable_of_pictures(validation_csv.M.values, FIW_FEATURES)
val_child_features = read_features_from_iterable_of_pictures(validation_csv.C.values, FIW_FEATURES)

HBox(children=(FloatProgress(value=0.0, max=3568.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3568.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3568.0), HTML(value='')))




In [15]:
def combine_fmc_features(father_feats, mother_feats, child_feats):
    fc_cosine_sim = np.array([distance.cosine(u, v) for u, v in zip(father_feats, child_feats)]).reshape(-1, 1)
    mc_cosine_sim = np.array([distance.cosine(u, v) for u, v in zip(mother_feats, child_feats)]).reshape(-1, 1)
    fc_mc_cosine_sim = np.hstack((fc_cosine_sim, mc_cosine_sim))
    return np.mean(fc_mc_cosine_sim, axis=1)

In [17]:
val_scores = combine_fmc_features(val_father_features, val_mother_features, val_child_features)
val_labels = validation_csv.label.values.copy()

In [20]:
thresholds = np.arange(1, 0, step=-0.0125)
accuracy_scores = []
for thresh in tqdm(thresholds):
    accuracy_scores.append(accuracy_score(val_labels, val_scores > thresh))

accuracies = np.array(accuracy_scores)
max_accuracy = accuracies.max() 
max_accuracy_threshold =  thresholds[accuracies.argmax()]
print(f"Max accuracy: {max_accuracy}")
print(f"Max accuracy threshold: {max_accuracy_threshold}")

HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))


Max accuracy: 0.5
Max accuracy threshold: 0.6125000000000014


The max accuracy is 0.5, and the threshold is 0.6125.

# Evaluation on test set

In [21]:
test_father_features = read_features_from_iterable_of_pictures(test_csv.father_img.values, FIW_FEATURES)
test_mother_features = read_features_from_iterable_of_pictures(test_csv.mother_img.values, FIW_FEATURES)
test_child_features = read_features_from_iterable_of_pictures(test_csv.child_img.values, FIW_FEATURES)

HBox(children=(FloatProgress(value=0.0, max=3470.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3470.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3470.0), HTML(value='')))




In [23]:
test_scores =  combine_fmc_features(test_father_features, test_mother_features, test_child_features)
test_labels = test_csv.label.values.copy()

In [31]:
test_csv["tag"] = test_csv["child_gender"].apply(lambda r: "FM-D" if r == "f" else "FM-S")
test_csv["pred"] = test_scores > max_accuracy_threshold

In [33]:
reltypes = test_csv.tag.unique()
accuracy_df = pd.DataFrame(columns=reltypes, dtype=float)
for rel in reltypes:
    y_true = test_csv[test_csv.tag == rel]["label"].values
    y_pred = test_csv[test_csv.tag == rel ]["pred"].values
    accuracy_df.loc[0, rel] = accuracy_score(y_true, y_pred)

In [36]:
accuracy_df.round(3)

Unnamed: 0,FM-D,FM-S
0,0.51,0.491
