# Determine NN of all samples used in pairs list of the Balance Faces in the Wild (BFW) dataset.

Uses the data in `data/bfw-datatable.pkl` to determine the NN. Saves the summary to `results/bfw-stats.csv`.

In [87]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity

import pathlib
sns.set_style('whitegrid',
              {'font.family': 'serif', 'font.serif': 'Times New Roman',
               'fontsize'   : 12})

# Load out custom tool for loading and processing the data
from facebias.io import load_bfw_feature_lut
from facebias.preprocessing import encode_filepaths_to_labels, encode_filepaths_to_ethnicity_labels, encode_filepaths_to_gender_labels

## Load the data

Get all feature paths and store as dictionary with keys set as the relative file paths.

In [88]:
dir_features = pathlib.Path('../../data/features/resnet50/')
f_features = dir_features.joinpath('allfeatures.pkl')
if f_features.is_file():
    features = pd.read_pickle(str(f_features))
else:
    files_features = glob.glob(f'{str(dir_features)}/*males/n*/*.npy')
    features = load_bfw_feature_lut(files_features, d_root=dir_features)
    pd.to_pickle(features, str(f_features))
print(f"{len(features)} features loaded")


20000 features loaded


## Calculate similarity matrix.

Pass N features to cosine_similarity; returns an NxN matrix of scores between ith row and jth column.

In [89]:
tup_features = tuple(features.items())


In [90]:
score_matrix = cosine_similarity([f[1][0] for f in tup_features])

In [98]:
file_tags = [f[0] for f in tup_features]
labels_attribute = encode_filepaths_to_labels(file_tags)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tags = [str(pathlib.Path(f).parent.parent).split('_')[1] for f in file_tags]
le.fit(tags)
labels_ethnicity = le.transform(tags)

tags = [str(pathlib.Path(f).parent.parent).split('_')[0] for f in file_tags]
le.fit(tags)
labels_gender = le.transform(tags)


# labels_ethnicity = encode_filepaths_to_ethnicity_labels(file_tags)
# 
# labels_gender = encode_filepaths_to_gender_labels([f[0] for f in tup_features])

In [101]:

print(f"No. of Identities: {len(np.unique(labels_attribute))}\n"
      f"No. of Genders: {len(np.unique(labels_gender))}\nN"
      f"o. of Ethnics: {len(np.unique(labels_ethnicity))}\n")


No. of Identities: 800
No. of Genders: 4
No. of Ethnics: 2



In [None]:

# conf = np.array(list(score_matrix)).reshape((-1, int(np.sqrt(len(score_matrix)))))
# plt.imshow(score_matrix)
# plt.tight_layout()
sns.heatmap(score_matrix)

KeyboardInterrupt: 

In [None]:

colormap = sns.diverging_palette(220, 10, as_cmap=True)  # , cmap='coolwarm_r'
my_pal = sns.color_palette("Blues_r", 8)
my_pal[-1] = (1.0, 1.0, 1.0)
ax = plt.subplot()
sns.heatmap(df_conf, annot=True, linewidths=.1, square=True, cmap=my_pal,
            cbar_kws={'shrink': .4, 'ticks': [0.9, 0.925, 0.95, 0.975, 1.0]},
            linecolor='black', ax=ax, fmt='.3f', annot_kws={'size': 12},
            cbar=False)

sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

# add the column names as labels
ax.set_yticklabels(df_conf.columns, rotation=0)
ax.set_xticklabels(df_conf.columns)
ax.axhline(y=0, color='k', linewidth=2)
ax.axhline(y=df_conf.shape[1], color='k', linewidth=2)
ax.axvline(x=0, color='k', linewidth=2)
ax.axvline(x=df_conf.shape[0], color='k', linewidth=2)
plt.tight_layout()
plt.savefig(CONFIGS.path.doutput + 'confusion.pdf', transparent=True)

