# Prepare datatable used to benchmark the Balanced Faces in the Wild (BFW) dataset.

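Load the list of pairs from `bfw-v<version>-pairs.csv` in the metadata directory, add per-sample metadata columns (attributes, ethnicity, gender, subject IDs, and an empty score column), and store the result as the datatable `bfw-<version>-datatable.pkl` in the same directory.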

## Add project code to PYTHONPATH, if not already there
Check that _path_package_ is set to _code_ directory on respective system

In [1]:
import pandas as pd
import pathlib
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Release of the BFW metadata files this notebook reads and writes.
version_bfw = "0.1.5"
# Metadata directory, relative to the notebook. The trailing slash is kept
# because file paths are built from it by plain string interpolation.
dir_meta = '../../data/bfw/meta/'

## Load list of pairs

Load the list of pairs: _p1_ and _p2_ are the two samples of each pair, _label_ is 1 if the pair is _genuine_ and 0 if it is an _imposter_, and _fold_ is the fold the pair belongs to. Note there is no overlap in subjects between folds. Finally, the experiments are 5-fold (hence, _fold_ $\in \{1, 2, 3, 4, 5\}$).

In [2]:
# Location of the pairs list for the configured BFW release.
fin = f'{dir_meta}bfw-v{version_bfw}-pairs.csv'
# Read every pair into memory and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=fin)
data.head(n=5)

Unnamed: 0,fold,p1,p2,label
0,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0043_01.jpg,1
1,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0120_01.jpg,1
2,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0122_02.jpg,1
3,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0188_01.jpg,1
4,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0205_01.jpg,1


## Display stats
For each fold, count how many pairs are _genuine_ (i.e., 1) and how many are _imposter_ (i.e., 0).

In [3]:
# Number of pairs per (fold, label) combination, taken from the 'p1' column
# and exposed under the single column name 'stats'.
dfcounts = (
    data.groupby(by=['fold', 'label'])[['p1']]
    .count()
    .rename(columns={'p1': 'stats'})
)
# Show the whole table rather than only the default first five rows.
dfcounts.head(len(dfcounts))

Unnamed: 0_level_0,Unnamed: 1_level_0,stats
fold,label,Unnamed: 2_level_1
1,0,136194
1,1,48514
2,0,136185
2,1,48469
3,0,136284
3,1,48527
4,0,136242
4,1,48512
5,0,136474
5,1,48497


In [4]:
# The counts table was only built for the display above; drop the reference.
del dfcounts


## Add metadata to table, set format appropriately each step
The pairs list carries rich information; extracting it into the datatable now will be convenient later.

Set attributes, IDs (str), and abbreviated variants of attribute, gender, and ethnicity for both _p1_ and _p2_

In [5]:
# Attribute (subgroup) of each sample: the first path component,
# e.g. 'asian_females' from 'asian_females/n000009/0010_01.jpg'.
data['att1'] = data['p1'].apply(lambda x: x.split('/')[0]).astype('category')
data['att2'] = data['p2'].apply(lambda x: x.split('/')[0]).astype('category')

# Ethnicity abbreviation: upper-cased first letter of the part before '_'
# (e.g. 'asian_females' -> 'A').
data['e1'] = data.att1.apply(lambda x: x.split('_')[0][0].upper())
data['e2'] = data.att2.apply(lambda x: x.split('_')[0][0].upper())

# Gender abbreviation: upper-cased first letter of the part after '_'
# (e.g. 'asian_females' -> 'F').
data['g1'] = data.att1.apply(lambda x: x.split('_')[1][0].upper())
data['g2'] = data.att2.apply(lambda x: x.split('_')[1][0].upper())

# Abbreviated attribute: ethnicity + gender (e.g. 'AF'). Built before e*/g*
# are converted to categoricals, since '+' concatenates the plain strings.
data['a1'] = (data['e1'] + data['g1']).astype('category')
data['a2'] = (data['e2'] + data['g2']).astype('category')

data['e1'] = data['e1'].astype('category')
data['e2'] = data['e2'].astype('category')
data['g1'] = data['g1'].astype('category')
data['g2'] = data['g2'].astype('category')

# Placeholder column for scores, initialised to NaN for every row.
# `pd.np` was deprecated in pandas 1.0 and removed in 2.0, so a plain float
# NaN is used; the resulting column is float64 NaN either way.
data['score'] = float('nan')
data.head()

Unnamed: 0,fold,p1,p2,label,att1,att2,e1,e2,g1,g2,a1,a2,score
0,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0043_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,
1,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0120_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,
2,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0122_02.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,
3,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0188_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,
4,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0205_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,


### assign unique ID tags per subject 
Encode N subjects as 0, 1, ..., N - 1

In [6]:
def _subject_of(path):
    """Return the image path without its final component (the file name)."""
    return "/".join(path.split('/')[:-1])


# Subject key (e.g. 'asian_females/n000009') for each side of every pair.
subjects_p1 = data['p1'].apply(_subject_of)
subjects_p2 = data['p2'].apply(_subject_of)

# Fit the encoder on the union of subjects seen on either side, so that both
# ID columns share one label space.
subject_names = list(set(subjects_p1) | set(subjects_p2))
le = LabelEncoder()
le.fit(subject_names)

data['ids1'] = le.transform(subjects_p1)
data['ids2'] = le.transform(subjects_p2)
data.head()

Unnamed: 0,fold,p1,p2,label,att1,att2,e1,e2,g1,g2,a1,a2,score,ids1,ids2
0,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0043_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,,0,0
1,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0120_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,,0,0
2,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0122_02.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,,0,0
3,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0188_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,,0,0
4,1,asian_females/n000009/0010_01.jpg,asian_females/n000009/0205_01.jpg,1,asian_females,asian_females,A,A,F,F,AF,AF,,0,0


In [7]:
# Spot-check 10 randomly drawn rows; no random_state is passed, so the
# selection differs from run to run.
data.sample(10)

Unnamed: 0,fold,p1,p2,label,att1,att2,e1,e2,g1,g2,a1,a2,score,ids1,ids2
834651,5,white_females/n003324/0328_01.jpg,indian_males/n006363/0494_01.jpg,0,white_females,indian_males,W,I,F,M,WF,IM,,639,542
680920,3,black_males/n004824/0124_01.jpg,asian_females/n000143/0002_01.jpg,0,black_males,asian_females,B,A,M,F,BM,AF,,354,5
274745,3,indian_females/n001026/0180_01.jpg,indian_females/n007080/0096_01.jpg,0,indian_females,indian_females,I,I,F,F,IF,IF,,405,460
831832,4,white_females/n007006/0042_01.jpg,white_males/n003125/0096_01.jpg,0,white_females,white_males,W,W,F,M,WF,WM,,684,736
697953,1,indian_females/n006723/0036_02.jpg,indian_females/n008040/0100_01.jpg,0,indian_females,indian_females,I,I,F,F,IF,IF,,452,485
38478,2,asian_females/n006203/0009_04.jpg,asian_females/n009125/0321_01.jpg,0,asian_females,asian_females,A,A,F,F,AF,AF,,55,87
222441,4,black_males/n001036/0135_01.jpg,black_males/n001621/0161_02.jpg,0,black_males,black_males,B,B,M,M,BM,BM,,311,314
793616,3,indian_males/n003664/0046_01.jpg,indian_females/n001025/0393_01.jpg,0,indian_males,indian_females,I,I,M,F,IM,IF,,521,404
911181,3,white_males/n006985/0056_01.jpg,indian_females/n006724/0034_01.jpg,0,white_males,indian_females,W,I,M,F,WM,IF,,775,453
640357,1,black_males/n004605/0102_01.jpg,white_females/n004322/0079_02.jpg,0,black_males,white_females,B,W,M,F,BM,WF,,351,652


## Save datatable
If the file does not exist, write it to disk.

In [8]:
# Destination of the pickled datatable inside the metadata directory.
fout = f'{dir_meta}/bfw-{version_bfw}-datatable.pkl'
# Only write when the target is missing. The check must be made on `fout`:
# a bare `pathlib.Path()` is the current directory, which is never a regular
# file, so the guard would always pass and the pickle would be rewritten.
if not pathlib.Path(fout).is_file():
    data.to_pickle(fout)