In [None]:
from lib.compare import CompareModel, config_rank_comparator
from lib.data import load_info, sample_pairs, aggregate_subjects
from lib.validate import validate, plot_roc
from lib.submit import compare_all
from lib import metrics
from pathlib import Path
import pickle
import mxnet as mx
import numpy as np
from skimage import io
import cv2
from tqdm import tqdm
import typing as t
from functools import partial
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import cluster, preprocessing
import optuna
from sklearn.neighbors import kneighbors_graph
from more_itertools import unzip

In [None]:
def _load_existing(data_dir: Path, csv_file: Path):
    """Load image metadata and keep only the rows whose image file is on disk.

    Calls ``load_info(data_dir, csv_file)`` and filters on the ``img_path``
    column by calling ``exists()`` on every entry. Rows are selected by
    position with an index array that is always integer-typed, including when
    no file is found (a plain ``np.array([])`` would be a float array).
    """
    info = load_info(data_dir, csv_file)
    present = np.flatnonzero([cur_path.exists() for cur_path in info['img_path']])
    return info.iloc[present]


# Training split metadata.
data_path = Path('/run/media/andrey/Fast/FairFace/fixed_data_wide/train/data')
csv_path = Path('data/train_df.csv')
data = _load_existing(data_path, csv_path)

# Validation split: a separate CSV over the same image directory as the training split.
val_data_path = Path('/run/media/andrey/Fast/FairFace/fixed_data_wide/train/data')
val_csv_path = Path('data/wide_val.csv')
val_data = _load_existing(val_data_path, val_csv_path)

In [None]:
# Draw one fixed set of validation pairs; the objectives defined further down all
# score against these same pairs and labels.
num_sample = 20000
subject_dict = aggregate_subjects(val_data['TEMPLATE_ID'], val_data['SUBJECT_ID'])
pair_stream, label_stream = unzip(sample_pairs(subject_dict, num_sample))
sampled_pairs = list(pair_stream)
sampled_labels = np.array(list(label_stream))

In [None]:
# Experiment names; each one is also the snapshot sub-directory / file prefix used below.
experiments = [
    'arcface_ft_norm2',
    'full_ft3',
    'ultimate5',
    'ultimate7',
    'test_center_vgg',
]

In [None]:
# Build one rank comparator per experiment, keyed by experiment name.
comparators = {}
for cur_exp in tqdm(experiments):
    models_path = Path('experiments', cur_exp, 'snapshots')
    # The epoch to load is derived from the number of entries in the snapshot directory.
    last_epoch = sum(1 for _ in models_path.iterdir()) - 1
    comparator = CompareModel(models_path / cur_exp, last_epoch, ctx=mx.gpu(0), use_flip=True)
    comparators[cur_exp] = rank_comparator = config_rank_comparator(comparator, val_data['img_path'])

In [6]:
def optuna_rank(trial):
    """Optuna objective: ROC AUC of an averaged ensemble over a subset of experiments.

    One boolean categorical parameter is suggested per entry of ``experiments``
    (named after the experiment). For every selected experiment the pre-built
    comparator from ``comparators`` is run through ``validate`` on the shared
    ``sampled_pairs`` / ``sampled_labels``; element ``[1]`` of each result is
    averaged across the selected experiments and scored with ``roc_auc_score``.

    Raises:
        optuna.TrialPruned: when the trial selects no experiment. There is
            nothing to average in that case (``np.mean`` of an empty list is
            NaN, which cannot be scored), so the trial is pruned instead.
    """
    # Every parameter is suggested on every trial, in a fixed order, so the
    # search space is identical across trials.
    to_use = [cur_exp for cur_exp in experiments
              if trial.suggest_categorical(cur_exp, [False, True])]
    if not to_use:
        raise optuna.TrialPruned()
    result_ranks = [
        validate(comparators[cur_exp], val_data_path, val_csv_path, num_sample=0,
                 pairs=sampled_pairs, labels=sampled_labels)[1]
        for cur_exp in to_use
    ]
    merged_ranks = np.mean(result_ranks, axis=0)
    return roc_auc_score(sampled_labels, merged_ranks)

In [7]:
# Search over experiment subsets: at most 30 trials or three hours, whichever comes first.
study = optuna.create_study(direction='maximize')
study.optimize(
    optuna_rank,
    n_trials=30,
    n_jobs=1,
    timeout=3600 * 3,
)

# Report the best subset found.
trial = study.best_trial
print('AUC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

100%|██████████| 20000/20000 [00:01<00:00, 11062.83it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11486.86it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11507.79it/s]
[I 2020-06-29 11:35:57,589] Finished trial#0 with value: 0.992589255 with parameters: {'arcface_ft_norm2': True, 'full_ft3': False, 'ultimate5': False, 'ultimate7': True, 'test_center_vgg': True}. Best is trial#0 with value: 0.992589255.
100%|██████████| 20000/20000 [00:01<00:00, 11457.35it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11337.74it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11500.02it/s]
[I 2020-06-29 11:36:02,916] Finished trial#1 with value: 0.99084415 with parameters: {'arcface_ft_norm2': False, 'full_ft3': True, 'ultimate5': True, 'ultimate7': True, 'test_center_vgg': False}. Best is trial#0 with value: 0.992589255.
100%|██████████| 20000/20000 [00:01<00:00, 11446.16it/s]
[I 2020-06-29 11:36:04,732] Finished trial#2 with value: 0.988950085 with parameters: {'arcface_ft_norm2': True, 'full_ft3': F

100%|██████████| 20000/20000 [00:01<00:00, 10558.91it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11181.73it/s]
[I 2020-06-29 11:37:47,914] Finished trial#28 with value: 0.9937646549999999 with parameters: {'arcface_ft_norm2': False, 'full_ft3': False, 'ultimate5': True, 'ultimate7': False, 'test_center_vgg': True}. Best is trial#16 with value: 0.9937646549999999.
100%|██████████| 20000/20000 [00:01<00:00, 11070.08it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11405.61it/s]
100%|██████████| 20000/20000 [00:01<00:00, 11343.40it/s]
100%|██████████| 20000/20000 [00:01<00:00, 10167.33it/s]
[I 2020-06-29 11:37:55,288] Finished trial#29 with value: 0.992555505 with parameters: {'arcface_ft_norm2': True, 'full_ft3': False, 'ultimate5': True, 'ultimate7': True, 'test_center_vgg': True}. Best is trial#16 with value: 0.9937646549999999.


AUC: 0.9937646549999999
Best hyperparameters: {'arcface_ft_norm2': False, 'full_ft3': False, 'ultimate5': True, 'ultimate7': False, 'test_center_vgg': True}


In [None]:
# NOTE(review): this import lives in a mid-notebook cell rather than in the imports cell
# at the top; consider moving it there.
from insightface import model_zoo
# The returned model is not bound to a name — presumably this call is only here to
# fetch/cache the pretrained 'arcface_r100_v1' model; TODO confirm.
model_zoo.get_model('arcface_r100_v1')

In [None]:
# Rebinds `comparator` (also assigned inside the per-experiment loop earlier) to a model
# built with only a GPU context. No snapshot path or epoch is passed, so CompareModel's
# own defaults apply — presumably a pretrained baseline; TODO confirm.
comparator = CompareModel(ctx=mx.gpu(0))

In [None]:
# Compare every validation image with itself. The next cell reads
# `comparator.embeddings`, so this pass presumably exists to populate that cache.
res = []
for img_path in tqdm(val_data['img_path']):
    res.append(comparator(img_path, img_path))

In [None]:
# Stack the cached per-image embeddings in the row order of val_data['img_path'].
embeddings_dict = comparator.embeddings
embeddings = np.array([embeddings_dict[img_path] for img_path in val_data['img_path']])

# Per-row squared L2 norm (kept as a column so it broadcasts), and unit-length rows.
embeddings_sqr = (embeddings ** 2).sum(axis=-1, keepdims=True)
norm_embeddings = embeddings / np.sqrt(embeddings_sqr)

# Pairwise cosine distance, clamped at zero to absorb negative round-off.
cosine_dist_matrix = np.maximum(
    0,
    1 - norm_embeddings.dot(norm_embeddings.T),
)
# Pairwise ||a||^2 + ||b||^2 - 2 a.b, clamped at zero.
# NOTE(review): this is the *squared* Euclidean distance (no square root is taken).
euclidean_dist_matrix = np.maximum(
    0,
    embeddings_sqr + embeddings_sqr.T - 2 * embeddings.dot(embeddings.T),
)

In [None]:
# Neighbour counts for which k-NN graphs are precomputed. The tuples below keep this
# order, which is what the clustering objective indexes into.
_neighbor_counts = (3, 10, 30, 100)

eu_connectivity = tuple(
    kneighbors_graph(embeddings, n_neighbors=cur_k, metric='euclidean', include_self=False)
    for cur_k in _neighbor_counts
)
eu_connectivity_3, eu_connectivity_10, eu_connectivity_30, eu_connectivity_100 = eu_connectivity

co_connectivity = tuple(
    kneighbors_graph(embeddings, n_neighbors=cur_k, metric='cosine', include_self=False)
    for cur_k in _neighbor_counts
)
co_connectivity_3, co_connectivity_10, co_connectivity_30, co_connectivity_100 = co_connectivity

In [None]:
# Sentinel label for embeddings that are not assigned to any cluster: labels start out
# as NO_LABEL in label_embeddings, and get_medians leaves such rows unchanged.
NO_LABEL = -1


def calc_closest_k(embs, k: int = 2, batch_size: int = 10 ** 3, use_gpu: bool = True):
    """Find, for every embedding, the ``k`` rows with the smallest cosine distance.

    The embeddings are L2-normalised on the chosen MXNet device, then processed in
    row batches of ``batch_size``: each batch is compared against the full set with
    ``1 - dot`` and reduced with an ascending ``topk``.

    Args:
        embs: 2-D array of embeddings, one row per item.
        k: number of closest rows to keep per item.
        batch_size: number of query rows handled per iteration.
        use_gpu: run on ``mx.gpu(0)`` when True, otherwise on ``mx.cpu()``.

    Returns:
        ``(closest_vals, closest_idx)``: arrays of shape ``(len(embs), k)`` with the
        float32 distances and int32 row indices. No row is excluded from the
        comparison, so an item can appear in its own neighbour list.
    """
    total = embs.shape[0]
    closest_vals = np.empty((total, k), dtype=np.float32)
    closest_idx = np.empty((total, k), dtype=np.int32)
    ctx = mx.gpu(0) if use_gpu else mx.cpu()
    mx_embeddings = mx.nd.array(embs.astype(np.float32), ctx=ctx)
    row_norms = mx.nd.sqrt(mx.nd.sum(mx_embeddings ** 2, axis=-1, keepdims=True))
    mx_embeddings = mx_embeddings / row_norms
    for start in tqdm(range(0, total, batch_size)):
        stop = min(total, start + batch_size)
        batch_dist = 1 - mx.nd.dot(mx_embeddings[start:stop], mx_embeddings.T)
        batch_vals, batch_idx = mx.nd.topk(
            batch_dist, k=k, axis=-1, ret_typ='both', dtype='int32', is_ascend=True)
        closest_vals[start:stop] = batch_vals.asnumpy()
        closest_idx[start:stop] = batch_idx.asnumpy()
    return closest_vals, closest_idx


def label_embeddings(embs, k: int = 3, threshold: float = 0.3, use_gpu: bool = True):
    """Greedily group embeddings whose cosine distance is below ``threshold``.

    Rows are visited in order. An unlabelled row starts a new label that is given to
    those of its ``k`` closest rows (from ``calc_closest_k``) lying closer than
    ``threshold``. If any of those rows already carries a label, every row of that
    older label is absorbed into the new one, so overlapping groups are merged.

    Args:
        embs: 2-D array of embeddings, one row per item.
        k: number of closest rows considered per item.
        threshold: cosine-distance cut-off for joining a group.
        use_gpu: forwarded to ``calc_closest_k``.

    Returns:
        int32 array with one label per row. Label ids are not contiguous (absorbed
        labels leave gaps), and a row keeps ``NO_LABEL`` if nothing labelled it.
    """
    labels = np.full((embs.shape[0],), NO_LABEL, dtype=np.int32)
    cur_label = 0
    closest_vals, closest_idx = calc_closest_k(embs, k=k, use_gpu=use_gpu)
    for cur_idx, (cur_dists, cur_closest) in tqdm(enumerate(zip(closest_vals, closest_idx)),
                                                  total=embs.shape[0]):
        if labels[cur_idx] != NO_LABEL:
            continue
        to_label = cur_closest[cur_dists < threshold]
        neighbour_labels = labels[to_label]
        already_labelled = neighbour_labels != NO_LABEL
        # The original test was `np.all(labels[to_label]) != NO_LABEL`, which compares a
        # bool with -1 and is therefore always true; the intended check is whether any
        # neighbour already belongs to a group.
        if np.any(already_labelled):
            absorbed = np.unique(neighbour_labels[already_labelled])
            members = np.flatnonzero(np.isin(labels, absorbed))
            to_label = np.concatenate([to_label, members], axis=0)
        labels[to_label] = cur_label
        cur_label += 1
    return labels


def cluster_sklearn(embs, algorithm, dist_matrix=None):
    """Fit a scikit-learn style clusterer and return its ``labels_``.

    The clusterer is fitted on ``dist_matrix`` when one is given, otherwise on the
    raw embeddings ``embs``.
    """
    fit_input = embs if dist_matrix is None else dist_matrix
    algorithm.fit(fit_input)
    return algorithm.labels_


def get_medians(embs, labels, norm_median: bool = False):
    """Replace every labelled embedding with the centre of its label group.

    Despite the name, the centre is the arithmetic mean (``np.mean``) of the group's
    embeddings. Rows labelled ``NO_LABEL`` keep their own embedding.

    Args:
        embs: 2-D array of embeddings, one row per item.
        labels: one label per row of ``embs``.
        norm_median: when True, each centre is scaled to unit L2 norm.

    Returns:
        float32 array with the same shape as ``embs``.
    """
    # Collect the member embeddings of every label.
    grouped = {}
    for vec, lbl in zip(embs, labels):
        grouped.setdefault(lbl, []).append(vec)

    # One centre per real label.
    centres = {}
    for lbl, members in grouped.items():
        if lbl != NO_LABEL:
            centre = np.mean(members, axis=0)
            if norm_median:
                centre = centre / np.sqrt(np.sum(centre ** 2, axis=-1, keepdims=True))
            centres[lbl] = centre

    # Scatter the centres back to the original row order.
    ret_medians = np.empty(embs.shape, dtype=np.float32)
    for row, lbl in enumerate(labels):
        ret_medians[row] = embs[row] if lbl == NO_LABEL else centres[lbl]
    return ret_medians


def config_median_comparator(comparator, label_method, all_paths: t.List[Path], metric, norm_median: bool = False, median_alpha: float = 1.0):
    """Build a pair comparator that blends each embedding with its cluster centre.

    Args:
        comparator: callable ``comparator(path, path)`` that exposes an ``embeddings``
            mapping from path to embedding.
        label_method: callable taking the stacked embeddings and returning one label
            per row.
        all_paths: image paths to prepare; it is iterated several times, so it must be
            a re-iterable collection (callers in this notebook pass a DataFrame column).
        metric: callable ``metric(left, right)`` applied to the two blended vectors.
        norm_median: forwarded to ``get_medians``.
        median_alpha: weight of the cluster centre in the blend; the raw embedding
            gets ``1 - median_alpha``.

    Returns:
        ``compare(left_path, right_path)``, which looks both paths up among
        ``all_paths`` and returns ``metric`` of their blended vectors.
    """
    # Run every path through the comparator once. The return values are not needed;
    # the embeddings are read back from `comparator.embeddings` just below, so this
    # pass presumably exists to fill that cache.
    for cur_p in all_paths:
        comparator(cur_p, cur_p)
    embeddings_dict = comparator.embeddings
    embeddings = np.array([embeddings_dict[cur_p] for cur_p in all_paths])
    path_idx = {path: idx for idx, path in enumerate(all_paths)}
    labels = label_method(embeddings)
    medians = get_medians(embeddings, labels, norm_median)
    # The blend does not depend on which pair is compared, so it is computed once for
    # all rows instead of twice on every compare() call.
    blended = medians * median_alpha + embeddings * (1 - median_alpha)

    def compare(left_path: Path, right_path: Path):
        return metric(blended[path_idx[left_path]], blended[path_idx[right_path]])

    return compare


def comp_validate(comparator):
    """Score ``comparator`` on the shared sampled pairs and return the ROC AUC.

    ``compare_all`` is run over ``sampled_pairs``; the third field of each result is
    taken as the prediction and scored against ``sampled_labels``.
    """
    columns = unzip(compare_all(val_data_path, sampled_pairs, comparator))
    preds = np.array(list(columns[2]))
    return roc_auc_score(sampled_labels, preds)


def optuna_cluster_objective(trial):
    """Optuna objective: AUC of a cluster-centre comparator built with a sklearn clusterer.

    The trial chooses the clustering distance threshold, the k-NN connectivity graph,
    the linkage, the pair metric and the centre/embedding blend weight. The resulting
    comparator is scored with ``comp_validate``.

    Raises:
        ValueError: if ``cluster_method`` names an unsupported algorithm.
    """
    # Method and clustering metric are pinned for now. The other branches are kept so
    # they can be re-enabled by suggesting these two values from the trial instead.
    cluster_method = 'Agglomerative clustering'
    cluster_metric = 'euclidean'
    if cluster_metric == 'cosine':
        eps = trial.suggest_uniform('distance_threshold', 0.1, 2.0)
        dist = cosine_dist_matrix
    else:
        eps = trial.suggest_uniform('distance_threshold', 50, 700)
        dist = euclidean_dist_matrix
    if cluster_method == 'DBSCAN':
        algorithm = cluster.DBSCAN(
            eps=eps,
            min_samples=trial.suggest_int('min_samples', 1, 10),
            metric='precomputed',
            n_jobs=1
        )
    elif cluster_method == 'OPTICS':
        algorithm = cluster.OPTICS(
            min_samples=trial.suggest_int('min_samples', 1, 10),
            max_eps=eps,
            metric='precomputed',
            cluster_method=trial.suggest_categorical('ins_cluster_method', ['xi', 'dbscan']),
            n_jobs=1
        )
    elif cluster_method == 'Agglomerative clustering':
        use_connectivity = True  # pinned; could be suggested as 'use_connectivity'
        if use_connectivity:
            neighbor_choices = [3, 10, 30, 100]
            num_k = trial.suggest_categorical('k_neighbors', neighbor_choices)
            # The position of num_k in the choices matches the order of the precomputed
            # graph tuples. The previous if/elif/else chain sent k == 3 to index 3,
            # i.e. the 100-neighbour graph.
            idx = neighbor_choices.index(num_k)
            if cluster_metric == 'cosine':
                connectivity = co_connectivity[idx]
            else:
                connectivity = eu_connectivity[idx]
        else:
            connectivity = None
        # The clusterer computes distances itself (affinity is not 'precomputed'), so
        # it is fitted on the raw embeddings rather than on a distance matrix.
        dist = None
        posibility = ['complete', 'average', 'single']
        if cluster_metric == 'euclidean':
            posibility.append('ward')
        algorithm = cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity=cluster_metric,
            memory='/run/media/andrey/Data/cluster_cache/',
            connectivity=connectivity,
            linkage=trial.suggest_categorical('linkage', posibility),
            distance_threshold=eps
        )
    else:
        # Fail here with a clear message instead of passing None on to be fitted.
        raise ValueError('Unknown cluster method: {}'.format(cluster_method))
    metric_str = trial.suggest_categorical('metric', ['cosine', 'euclidean'])
    if metric_str == 'cosine':
        metric = metrics.cosine
    else:
        metric = metrics.euclidean
    norm_median = False  # pinned; could be suggested as 'norm_median'
    median_alpha = trial.suggest_uniform('median_alpha', 0.0, 1.0)
    all_paths = val_data['img_path']
    median_comparator = config_median_comparator(
        comparator,
        partial(cluster_sklearn, algorithm=algorithm, dist_matrix=dist),
        all_paths, metric, norm_median, median_alpha)
    return comp_validate(median_comparator)


def optuna_objective(trial):
    """Optuna objective: AUC of a cluster-centre comparator built with ``label_embeddings``.

    The trial chooses the pair metric, the neighbour count and distance threshold used
    for labelling, whether centres are normalised, and the centre/embedding blend
    weight. The resulting comparator is scored with ``comp_validate``.
    """
    metric_str = trial.suggest_categorical('metric', ['cosine', 'euclidean'])
    metric = metrics.cosine if metric_str == 'cosine' else metrics.euclidean
    label_method = partial(
        label_embeddings,
        k=trial.suggest_int('k_closest', 1, 1000),
        threshold=trial.suggest_uniform('threshold', 0.1, 1.5),
    )
    norm_median = trial.suggest_categorical('norm_median', [False, True])
    median_alpha = trial.suggest_uniform('median_alpha', 0.0, 1.0)
    median_comparator = config_median_comparator(
        comparator, label_method, val_data['img_path'], metric, norm_median, median_alpha)
    return comp_validate(median_comparator)


In [None]:
# Search over clustering settings: at most 100 trials or one hour, whichever comes first.
study = optuna.create_study(direction='maximize')
study.optimize(
    optuna_cluster_objective,
    n_trials=100,
    n_jobs=1,
    timeout=3600,
)

# Report the best configuration found.
trial = study.best_trial
print('AUC: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Hand-picked settings for the neighbour-based labelling comparator.
all_paths = val_data['img_path']
metric = metrics.cosine
k = 10
threshold = 0.3
norm_median = False
median_alpha = 1.0

median_comparator = config_median_comparator(
    comparator,
    partial(label_embeddings, k=k, threshold=threshold),
    all_paths,
    metric,
    norm_median=norm_median,
    median_alpha=median_alpha,
)

In [None]:
# Agglomerative clustering with a fixed configuration (cosine affinity, complete linkage).
algorithm = cluster.AgglomerativeClustering(
    n_clusters=None,
    affinity='cosine',
    memory='/run/media/andrey/Data/cluster_cache/',
    linkage='complete',
    distance_threshold=0.9291274601315189,
)
norm_median = False
median_alpha = 0.5040577648719912

# Same clustering and blend weight, three different pair metrics.
metric = metrics.euclidean
cluster_comparator_eu = config_median_comparator(
    comparator, partial(cluster_sklearn, algorithm=algorithm),
    all_paths, metric, norm_median, median_alpha)

metric = metrics.cosine
cluster_comparator_co = config_median_comparator(
    comparator, partial(cluster_sklearn, algorithm=algorithm),
    all_paths, metric, norm_median, median_alpha)

# Plain dot product along the last axis.
metric = lambda a, b: np.sum(a * b, -1)
cluster_comparator_my = config_median_comparator(
    comparator, partial(cluster_sklearn, algorithm=algorithm),
    all_paths, metric, norm_median, median_alpha)


In [None]:
# Validate each comparator with the same settings and plot the ROC curves together.
val = partial(validate, data_dir=val_data_path, validation_csv=val_csv_path, num_sample=200000)
roc_inputs = [
    ('basic', comparator),
    ('median', median_comparator),
    ('cluster_eu', cluster_comparator_eu),
    ('cluster_co', cluster_comparator_co),
    ('cluster_my', cluster_comparator_my),
]
plot_roc([val(cur_comparator) for _, cur_comparator in roc_inputs],
         [cur_name for cur_name, _ in roc_inputs])

In [None]:
# NOTE(review): 'data/wide_val.csv' is the same path that val_csv_path pointed to when
# val_data was loaded, so this overwrites the input CSV with the filtered frame (and,
# with to_csv defaults, writes the index as an extra column) — confirm this is intended.
val_data.to_csv('data/wide_val.csv')
# Preview the first rows of the saved frame.
val_data.head()