In [6]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0. Check LFW embeddings

In [43]:
from tqdm import tqdm

# Score every verification pair of the LFW pairs file with the dot product of
# the two images' embeddings. Produces `pairs_score` and `pairs_label`
# (1 = both images show the same subject, 0 = different subjects).
dataset_path = Path("/app/datasets/lfw/data_aligned_112_112")
img_names = np.array([str(x) for x in list(Path(dataset_path).rglob("*.jpg"))])
# NOTE(review): row i of `embs` is assumed to belong to img_names[i], i.e. the
# embeddings were saved in this same rglob order -- confirm against the code
# that produced scf_embs_lfw.npz.
embs = np.load("/app/cache/features/scf_embs_lfw.npz")["embs"]
assert len(embs) == len(img_names), "embeddings and image list differ in length"

# Resolve a path to its embedding row with one dict lookup instead of
# scanning the whole `img_names` array twice for every pair.
emb_row_by_path = {name: row for row, name in enumerate(img_names.tolist())}


def _pair_image_path(subject, image_number):
    """Return the path of image `image_number` (a digit string) of `subject`."""
    return Path(dataset_path) / subject / f"{subject}_{image_number.zfill(4)}.jpg"


# load pairs
pairs_path = "/app/datasets/lfw/pairs_val_6000.txt"
pairs_label = []
pairs_score = []
with open(pairs_path) as fd:
    for line in tqdm(fd):
        # Strip only the line terminator: `line[:-1]` would eat a real
        # character when the last line of the file has no trailing newline.
        line = line.rstrip("\r\n")
        if not line:
            continue  # tolerate blank lines (e.g. at the end of the file)
        pair_data = line.split("\t")
        if len(pair_data) == 3:
            # "<subject> <n1> <n2>": two images of the same subject
            first_image_path = _pair_image_path(pair_data[0], pair_data[1])
            second_image_path = _pair_image_path(pair_data[0], pair_data[2])
            label = 1
        elif len(pair_data) == 4:
            # "<subject1> <n1> <subject2> <n2>": images of two different subjects
            first_image_path = _pair_image_path(pair_data[0], pair_data[1])
            second_image_path = _pair_image_path(pair_data[2], pair_data[3])
            label = 0
        else:
            raise ValueError(f"Malformed pair line in {pairs_path}: {line!r}")
        # A pair that references an image missing from the dataset raises
        # KeyError here.
        pairs_label.append(label)
        pairs_score.append(
            embs[emb_row_by_path[str(first_image_path)]]
            @ embs[emb_row_by_path[str(second_image_path)]]
        )
pairs_score = np.array(pairs_score)
pairs_label = np.array(pairs_label)

0it [00:00, ?it/s]

6000it [00:10, 583.32it/s]


In [54]:
from evaluation.visualize import draw_score_distr_plot

# Split the pair scores by ground-truth label. The dictionary keys are the
# Russian plot labels for "true pair" / "false pair".
scores_distr = dict(
    zip(
        ("Истинная пара", "Ложная пара"),
        (pairs_score[pairs_label == 1], pairs_score[pairs_label == 0]),
    )
)
true_match_scores = scores_distr["Истинная пара"]
wrong_match_scores = scores_distr["Ложная пара"]

# Distribution plot, currently disabled. The data-name arguments must match
# the keys of `scores_distr` above.
# draw_score_distr_plot(
#     scores_distr=scores_distr,
#     score_type="EVM",
#     model_name="EVM",
#     in_data_name="Истинная пара",
#     out_data_name="Ложная пара",
# )

## 1. Create LFW OSFR protocol

In [57]:
# Index every aligned LFW image with a single directory traversal, so that
# img_names[i] and person_names[i] are derived from the same path object.
# The traversal order is whatever the filesystem yields; it is deliberately
# left unsorted so it stays consistent with the listing made for the
# embeddings in the earlier cell.
lfw_image_paths = list(Path(dataset_path).rglob("*.jpg"))
img_names = np.array([str(path) for path in lfw_image_paths])
# The subject name is the directory that directly contains the image.
person_names = np.array([path.parent.name for path in lfw_image_paths])

In [60]:
# Distinct subjects (sorted by name) and the number of images each one has.
unique_person_names, counts_person_names = np.unique(
    person_names,
    return_counts=True,
)

In [63]:
# Number of subjects with (more than 3 images, 2 or 3 images, exactly 1 image).
(
    np.sum(counts_person_names > 3),
    np.sum(np.isin(counts_person_names, (2, 3))),
    np.sum(counts_person_names == 1),
)

(610, 1070, 4069)

In [101]:
# Construct gallery and probe templates.
#
# Subjects with more than GALLERY_IMAGES_PER_SUBJECT images are "known": their
# first GALLERY_IMAGES_PER_SUBJECT images (in file-name order) form a gallery
# template and the remaining images form a known-probe template. Every other
# subject contributes a single unknown-probe template with all of its images.
#
# Each template is a tuple (image_paths, template_id, subject_id).
dataset_path = Path("/app/datasets/lfw/data_aligned_112_112")
GALLERY_IMAGES_PER_SUBJECT = 3
# Gallery template ids count up from 0 and probe template ids from this
# offset, so the two id ranges never overlap.
PROBE_TEMPLATE_ID_START = 700

image_path_to_template_id = {}
image_path_to_subject_id = {}
gallery_subjects = unique_person_names[
    counts_person_names > GALLERY_IMAGES_PER_SUBJECT
]
# Check for id collisions before any id is handed out.
assert len(gallery_subjects) < PROBE_TEMPLATE_ID_START, (
    "gallery template ids would run into the probe template id range"
)

gallery_templates = []
known_probe_templates = []
subject_id = 0
gallery_template_id = 0
probe_template_id = PROBE_TEMPLATE_ID_START
for subject in gallery_subjects:
    subject_dir = dataset_path / subject
    # Use the files that actually exist, in sorted order. With zero-padded
    # "<subject>_NNNN.jpg" names this equals numeric order, and it does not
    # depend on the numbering being contiguous.
    subject_images_paths = sorted(str(path) for path in subject_dir.glob("*.jpg"))
    for i, image_path in enumerate(subject_images_paths):
        image_path_to_subject_id[image_path] = subject_id
        image_path_to_template_id[image_path] = (
            gallery_template_id
            if i < GALLERY_IMAGES_PER_SUBJECT
            else probe_template_id
        )

    gallery_templates.append(
        (
            subject_images_paths[:GALLERY_IMAGES_PER_SUBJECT],
            gallery_template_id,
            subject_id,
        )
    )
    known_probe_templates.append(
        (
            subject_images_paths[GALLERY_IMAGES_PER_SUBJECT:],
            probe_template_id,
            subject_id,
        )
    )
    gallery_template_id += 1
    probe_template_id += 1
    subject_id += 1

# Subjects outside the gallery: subject and probe template ids simply
# continue from where the loop above stopped.
unknown_probe_templates = []
probe_subjects = unique_person_names[
    counts_person_names <= GALLERY_IMAGES_PER_SUBJECT
]
for probe_subject in probe_subjects:
    probe_subject_dir = dataset_path / probe_subject
    # Sorted so the order of images inside a template is reproducible.
    probe_template_paths = sorted(
        str(path) for path in probe_subject_dir.glob("*.jpg")
    )
    for image_path in probe_template_paths:
        image_path_to_subject_id[image_path] = subject_id
        image_path_to_template_id[image_path] = probe_template_id
    unknown_probe_templates.append(
        (probe_template_paths, probe_template_id, subject_id)
    )
    probe_template_id += 1
    subject_id += 1

In [102]:
# Sanity checks on the protocol mappings built above.
# Every image must have received exactly one template id and one subject id.
assert len(image_path_to_template_id) == len(img_names), (
    "some images have no template id (or unknown paths were mapped)"
)
assert len(image_path_to_subject_id) == len(img_names), (
    "some images have no subject id (or unknown paths were mapped)"
)
# One subject id per person.
assert len(set(image_path_to_subject_id.values())) == len(unique_person_names), (
    "number of subject ids differs from the number of persons"
)
# Every person owns one template, except gallery subjects, who own two
# (a gallery template and a known-probe template).
assert len(set(image_path_to_template_id.values())) == len(
    unique_person_names
) + len(gallery_templates), "unexpected number of distinct template ids"

In [90]:
# Number of (gallery, known-probe, unknown-probe) templates.
tuple(
    len(templates)
    for templates in (gallery_templates, known_probe_templates, unknown_probe_templates)
)

(610, 610, 5139)

In [107]:
# Create the meta files of the identification protocol in lfw_ident/meta:
#   lfw_face_tid_mid.txt   one line per image:
#                          "<file name> <template id> <media id> <subject id>"
#   lfw_1N_probe_mixed.csv known + unknown probe templates
#   lfw_1N_gallery_G1.csv  gallery templates
# Both CSV files have the columns TEMPLATE_ID, SUBJECT_ID, FILENAME.
identification_ds_path = Path("/app/datasets/lfw_ident")
meta_path = identification_ds_path / "meta"
# parents=True also creates lfw_ident (and anything above it) when missing.
meta_path.mkdir(parents=True, exist_ok=True)

names = [Path(x).name for x in img_names]
# The media id of an image is its position in `img_names`.
mids = np.arange(len(img_names))
tids = [image_path_to_template_id[image_path] for image_path in img_names]
sids = [image_path_to_subject_id[image_path] for image_path in img_names]

out_file_tid_mid = meta_path / "lfw_face_tid_mid.txt"
with open(out_file_tid_mid, "w", encoding="utf-8") as fd:
    for name, tid, mid, sid in zip(names, tids, mids, sids):
        fd.write(f"{name} {tid} {mid} {sid}\n")

out_file_probe = meta_path / "lfw_1N_probe_mixed.csv"
out_file_gallery = meta_path / "lfw_1N_gallery_G1.csv"


def _flatten_templates(templates):
    """Expand (image_paths, template_id, subject_id) tuples to per-image lists.

    Returns three parallel lists with one entry per image: template ids,
    subject ids and image file names (without the directory part).
    """
    flat_tids, flat_sids, flat_names = [], [], []
    for image_paths, template_id, template_subject_id in templates:
        flat_tids.extend([template_id] * len(image_paths))
        flat_sids.extend([template_subject_id] * len(image_paths))
        flat_names.extend(Path(image_path).name for image_path in image_paths)
    return flat_tids, flat_sids, flat_names


tids_probe, sids_probe, names_probe = _flatten_templates(
    known_probe_templates + unknown_probe_templates
)
tids_gallery, sids_gallery, names_gallery = _flatten_templates(gallery_templates)

# Gallery and probe templates together must cover every image exactly once.
assert len(tids_gallery) + len(tids_probe) == len(img_names)
probe = pd.DataFrame(
    {
        "TEMPLATE_ID": tids_probe,
        "SUBJECT_ID": sids_probe,
        "FILENAME": names_probe,
    }
)
gallery = pd.DataFrame(
    {
        "TEMPLATE_ID": tids_gallery,
        "SUBJECT_ID": sids_gallery,
        "FILENAME": names_gallery,
    }
)

probe.to_csv(out_file_probe, sep=",", index=False)
gallery.to_csv(out_file_gallery, sep=",", index=False)

In [75]:
# Template counts again: (gallery, known-probe, unknown-probe).
tuple(
    map(len, (gallery_templates, known_probe_templates, unknown_probe_templates))
)

(610, 610, 5139)