# Prepare the datatable used to benchmark the Balanced Faces in the Wild (BFW) dataset.

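Load the table of pairs from `data/meta/bfw-v<version>-datatable.csv`, add the metadata columns described below, and store the result in `data/meta/bfw-v<version>-datatable.pkl`. The pickle is written only if it does not already exist; an existing file is not overwritten.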

## Add project code to PYTHONPATH, if not already there
Check that _path_package_ is set to _code_ directory on respective system

In [11]:
import pandas as pd
import numpy as np
import pathlib
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Version tag of the BFW metadata files; it is interpolated into the input
# (CSV) and output (pickle) file names below.
version_bfw = "0.1.5"
# Directory holding the metadata tables, relative to this notebook. The
# trailing slash is required: the CSV path is built by plain string
# concatenation with this value.
dir_meta = '../../data/meta/'

## Load list of pairs

Load the list of pairs: _p1_ and _p2_ are the two samples of each pair, _label_ is 1 if the pair is _genuine_ and 0 if it is an _imposter_, and _fold_ is the fold the pair belongs to. Note there is no overlap in subjects between folds. Finally, the experiments are 5-fold (hence, _fold_ $\in \{1, 2, 3, 4, 5\}$).

In [6]:
# Location of the versioned pairs table; `dir_meta` supplies the trailing
# path separator.
fin = dir_meta + f'bfw-v{version_bfw}-datatable.csv'
# Read the complete table of pairs into memory.
data = pd.read_csv(filepath_or_buffer=fin)
# Preview the first five rows.
data.head(5)

Unnamed: 0,fold,p1,p2,label,id1,id2,att1,att2,vgg16,resnet50,senet50,a1,a2,g1,g2,e1,e2
0,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0043_01.jpg,1,0,0,asian_females,asian_females,0.820039,0.703258,0.679089,AF,AF,F,F,A,A
1,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0120_01.jpg,1,0,0,asian_females,asian_females,0.719199,0.523613,0.594268,AF,AF,F,F,A,A
2,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0122_02.jpg,1,0,0,asian_females,asian_females,0.732029,0.527567,0.64368,AF,AF,F,F,A,A
3,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0188_01.jpg,1,0,0,asian_females,asian_females,0.607093,0.348211,0.458883,AF,AF,F,F,A,A
4,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0205_01.jpg,1,0,0,asian_females,asian_females,0.629153,0.384273,0.494913,AF,AF,F,F,A,A


## Display stats
For each fold, show how many pairs are _genuine_ (i.e., 1) and how many are _imposter_ (i.e., 0).

In [7]:
# Number of pairs per (fold, label) combination, where label 1 marks genuine
# pairs and 0 imposter pairs. Only `p1` is selected before aggregating so the
# remaining columns are not counted and then thrown away; the count is the
# number of non-null `p1` entries in each group.
dfcounts = (
    data.groupby(by=['fold', 'label'])[['p1']]
    .count()
    .rename(columns={'p1': 'stats'})
)
# Display the whole summary table (one row per fold/label combination).
dfcounts

Unnamed: 0_level_0,Unnamed: 1_level_0,stats
fold,label,Unnamed: 2_level_1
1,0,136194
1,1,48514
2,0,136185
2,1,48469
3,0,136284
3,1,48527
4,0,136242
4,1,48512
5,0,136474
5,1,48497


In [8]:
# Remove the summary table from the namespace once it has been displayed.
del dfcounts


## Add metadata to table, set format appropriately each step
There is rich information in the pairs list; extracting it into the datatable now will be convenient later.

Set attributes, IDs (str), and abbreviated variants of attribute, gender, and ethnicity for both _p1_ and _p2_

In [12]:
def _initial_of_part(attributes, part):
    """Return the upper-cased first letter of one '_'-separated token.

    attributes: Series of attribute names such as 'asian_females'.
    part: index of the '_'-separated token to take the letter from.

    The result is always cast to object dtype. Applying a function to a
    categorical column may itself return a categorical (when the mapped
    categories happen to be unique), and categorical columns cannot be
    concatenated with `+`.
    """
    letters = attributes.apply(lambda name: name.split('_')[part][0].upper())
    return letters.astype(object)


# Attribute (subgroup) name = top-level directory of each image path.
data['att1'] = data.p1.apply(lambda x: x.split('/')[0]).astype('category')
data['att2'] = data.p2.apply(lambda x: x.split('/')[0]).astype('category')

# Single-letter abbreviations: ethnicity from the first token of the
# attribute name, gender from the second.
eth1, eth2 = _initial_of_part(data.att1, 0), _initial_of_part(data.att2, 0)
gen1, gen2 = _initial_of_part(data.att1, 1), _initial_of_part(data.att2, 1)

data['e1'] = eth1.astype('category')
data['e2'] = eth2.astype('category')

data['g1'] = gen1.astype('category')
data['g2'] = gen2.astype('category')

# Two-letter attribute abbreviation (ethnicity + gender), built from the
# plain-string series so the concatenation cannot hit a categorical operand.
data['a1'] = (eth1 + gen1).astype('category')
data['a2'] = (eth2 + gen2).astype('category')

# Placeholder column for scores; left empty (NaN) here.
data['score'] = np.nan
data.head()

Unnamed: 0,fold,p1,p2,label,id1,id2,att1,att2,vgg16,resnet50,senet50,a1,a2,g1,g2,e1,e2,score
0,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0043_01.jpg,1,0,0,asian_females,asian_females,0.820039,0.703258,0.679089,AF,AF,F,F,A,A,
1,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0120_01.jpg,1,0,0,asian_females,asian_females,0.719199,0.523613,0.594268,AF,AF,F,F,A,A,
2,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0122_02.jpg,1,0,0,asian_females,asian_females,0.732029,0.527567,0.64368,AF,AF,F,F,A,A,
3,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0188_01.jpg,1,0,0,asian_females,asian_females,0.607093,0.348211,0.458883,AF,AF,F,F,A,A,
4,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0205_01.jpg,1,0,0,asian_females,asian_females,0.629153,0.384273,0.494913,AF,AF,F,F,A,A,


### assign unique ID tags per subject 
Encode N subjects as 0, 1, ..., N - 1

In [13]:
def _subject_of(path):
    """Return the path with its final '/'-separated component removed."""
    return "/".join(path.split('/')[:-1])


# Every distinct subject directory that occurs on either side of a pair.
subject_names = list(
    {_subject_of(p1) for p1 in data['p1'].unique()}
    | {_subject_of(p2) for p2 in data['p2'].unique()}
)

# Fit one encoder on the union so both sides share the same integer ids.
le = LabelEncoder()
le.fit(subject_names)

for side in ('1', '2'):
    data[f'ids{side}'] = le.transform(data[f'p{side}'].apply(_subject_of))
data.head()

Unnamed: 0,fold,p1,p2,label,id1,id2,att1,att2,vgg16,resnet50,senet50,a1,a2,g1,g2,e1,e2,score,ids1,ids2
0,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0043_01.jpg,1,0,0,asian_females,asian_females,0.820039,0.703258,0.679089,AF,AF,F,F,A,A,,0,0
1,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0120_01.jpg,1,0,0,asian_females,asian_females,0.719199,0.523613,0.594268,AF,AF,F,F,A,A,,0,0
2,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0122_02.jpg,1,0,0,asian_females,asian_females,0.732029,0.527567,0.64368,AF,AF,F,F,A,A,,0,0
3,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0188_01.jpg,1,0,0,asian_females,asian_females,0.607093,0.348211,0.458883,AF,AF,F,F,A,A,,0,0
4,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0205_01.jpg,1,0,0,asian_females,asian_females,0.629153,0.384273,0.494913,AF,AF,F,F,A,A,,0,0


In [14]:
# Show 10 randomly chosen rows as a spot check; no random_state is passed, so
# the selection differs from run to run.
data.sample(10)

Unnamed: 0,fold,p1,p2,label,id1,id2,att1,att2,vgg16,resnet50,senet50,a1,a2,g1,g2,e1,e2,score,ids1,ids2
852249,3,white_females/n003990/0078_05.jpg,asian_males/n003482/0311_08.jpg,0,648,126,white_females,asian_males,0.473782,0.200894,0.196996,WF,AM,F,M,W,A,,648,126
404189,5,white_females/n004186/0111_01.jpg,white_females/n006925/0190_03.jpg,0,650,683,white_females,white_females,0.522859,0.280298,0.351948,WF,WF,F,F,W,W,,650,683
204528,5,black_males/n009112/1085_01.jpg,black_males/n009112/0271_01.jpg,1,397,397,black_males,black_males,0.77351,0.626262,0.727041,BM,BM,M,M,B,B,,397,397
226456,4,black_males/n008905/0123_01.jpg,black_males/n003036/0102_01.jpg,0,389,340,black_males,black_males,0.809244,0.464509,0.497367,BM,BM,M,M,B,B,,389,340
672110,1,black_males/n008582/0192_01.jpg,black_females/n005185/0130_01.jpg,0,385,285,black_males,black_females,0.6363,0.191801,0.225399,BM,BF,M,F,B,B,,385,285
884553,4,white_males/n000509/0101_01.jpg,asian_males/n008631/0364_01.jpg,0,703,184,white_males,asian_males,0.442637,0.167413,0.267777,WM,AM,M,M,W,A,,703,184
68911,2,asian_males/n008033/0344_02.jpg,asian_males/n008033/0444_01.jpg,1,172,172,asian_males,asian_males,0.576889,0.602189,0.651252,AM,AM,M,M,A,A,,172,172
326113,1,indian_males/n008375/0061_01.jpg,indian_males/n009056/0436_07.jpg,0,585,598,indian_males,indian_males,0.460078,0.312519,0.315716,IM,IM,M,M,I,I,,585,598
376278,5,white_females/n004844/0190_01.jpg,white_females/n004844/0301_02.jpg,1,667,667,white_females,white_females,0.730588,0.635187,0.715491,WF,WF,F,F,W,W,,667,667
469333,1,asian_females/n006222/0170_02.jpg,asian_females/n006148/0165_01.jpg,0,56,52,asian_females,asian_females,0.643578,0.290027,0.359796,AF,AF,F,F,A,A,,56,52


## Save datatable
If the file does not exist, write it to disk.

In [25]:
# Destination of the prepared datatable. `dir_meta` already ends with a path
# separator, so the file name is joined with pathlib rather than by inserting
# another '/' into the string.
fout = pathlib.Path(dir_meta) / f'bfw-v{version_bfw}-datatable.pkl'
if not fout.exists():
    print("saving datatable")
    data.to_pickle(fout)
else:
    # An existing pickle is never overwritten; say so explicitly so a stale
    # file on disk does not go unnoticed after the table has been changed.
    print(f"datatable already exists, not overwriting: {fout}")

saving datatable
