In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import fastai
from fastai import *
from fastai.vision import *

In [3]:
# Image-count threshold per identity: identities need at least this many
# images to survive the `cnt >=` filters, and the registry builder's row
# counter is compared against it.
MAX_IMGS_PER_IDNT = 50
# Number of pairs generated per identity for each class (genuine / imposter).
MAX_PAIRS_FOR_IDNT_PER_CLS = MAX_IMGS_PER_IDNT // 2

The datasets below were collected by walking through the original VGGFace2 dataset and selecting images with a width between 200 and 300 pixels.

# explore valid dataset

In [4]:
# Load the listing of the valid split and give its columns explicit names.
valid_df = pd.read_csv('dataset/valid.csv')
# NOTE: 'hight' (sic) is the spelling used for both valid_df and train_df.
valid_df.columns = ['img_path', 'identity', 'width', 'hight']
# Summary statistics of the numeric columns.
valid_df.describe()

Unnamed: 0,width,hight
count,41323.0,41323.0
mean,245.787745,271.245553
std,29.372765,38.141079
min,200.0,145.0
25%,220.0,242.0
50%,244.0,267.0
75%,270.0,297.0
max,300.0,680.0


In [5]:
# Peek at the first rows: image path, identity and image dimensions.
valid_df.head()

Unnamed: 0,img_path,identity,width,hight
0,n000001/0014_01.jpg,n000001,263,274
1,n000001/0018_01.jpg,n000001,272,283
2,n000001/0020_01.jpg,n000001,283,292
3,n000001/0021_01.jpg,n000001,265,283
4,n000001/0024_01.jpg,n000001,221,221


# the same for train

In [6]:
# Load the listing of the train split and give its columns explicit names.
train_df = pd.read_csv('dataset/train.csv')
# NOTE: 'hight' (sic) is the spelling used for both valid_df and train_df.
train_df.columns = ['img_path', 'identity', 'width', 'hight']
# Summary statistics of the numeric columns.
train_df.describe()

Unnamed: 0,width,hight
count,760495.0,760495.0
mean,245.854973,270.000676
std,29.340138,38.04495
min,200.0,131.0
25%,220.0,241.0
50%,244.0,266.0
75%,271.0,296.0
max,300.0,700.0


In [7]:
# Distribution of the number of images per identity in the train split.
train_df.groupby('identity').size().describe()

count    8631.000000
mean       88.112038
std        29.792062
min        11.000000
25%        67.000000
50%        87.000000
75%       108.000000
max       220.000000
dtype: float64

# combine selected dataset

In [8]:
# Number of images per identity in the valid split, as a one-column frame.
valid_counted_df = valid_df.groupby('identity').size().to_frame('cnt')

# Keep only identities that have at least MAX_IMGS_PER_IDNT images.
has_enough_imgs = valid_counted_df['cnt'] >= MAX_IMGS_PER_IDNT
valid_counted_df = valid_counted_df[has_enough_imgs]

valid_counted_df.head()

Unnamed: 0_level_0,cnt
identity,Unnamed: 1_level_1
n000001,108
n000029,95
n000078,136
n000082,73
n000106,65


In [9]:
# Restrict the valid rows to the identities that passed the count filter.
valid_keep_mask = valid_df['identity'].isin(valid_counted_df.index)
valid_filtered_df = valid_df[valid_keep_mask]
valid_filtered_df.describe()

Unnamed: 0,width,hight
count,38122.0,38122.0
mean,245.977651,271.321101
std,29.369289,38.109912
min,200.0,145.0
25%,220.0,242.0
50%,244.0,267.0
75%,271.0,297.0
max,300.0,680.0


# some utility functions

In [32]:
from random import choice
import random


def prepare_idnt_image_idxs(filtered_df, max_imgs_per_idnt=None):
    """Build a per-identity image registry and an ordered list of identities.

    The previous implementation used a single counter shared by all
    identities, so the cap was never applied per identity; it only dropped
    one row after every run of accepted rows. The cap is now enforced
    independently for every identity.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must provide the columns ``'identity'`` and ``'img_path'``.
    max_imgs_per_idnt : int, optional
        Maximum number of image paths kept for a single identity. When
        omitted, the notebook-level ``MAX_IMGS_PER_IDNT`` is used.

    Returns
    -------
    image_registry : dict
        Maps each identity to the list of its image paths, in row order,
        truncated to ``max_imgs_per_idnt`` entries.
    idnt_idxs : list
        The identities in first-seen order (same order as the registry keys).
    """
    if max_imgs_per_idnt is None:
        max_imgs_per_idnt = MAX_IMGS_PER_IDNT

    # Prepare the image registry. Iterating over the two columns directly
    # avoids building a Series per row, which iterrows() does.
    image_registry = {}
    for idnt, img_path in zip(filtered_df['identity'], filtered_df['img_path']):
        paths = image_registry.setdefault(idnt, [])
        if len(paths) < max_imgs_per_idnt:
            paths.append(img_path)

    # Index identities: position in this list identifies an identity.
    idnt_idxs = list(image_registry)
    return image_registry, idnt_idxs


def combine_pairs(image_registry, idnt_idxs, n_pairs=None):
    """Create genuine and imposter image pairs for every identity.

    For each identity, ``n_pairs`` source images are taken from the start of
    its image list. Every source image yields two rows, in this order:

    * a ``'genuine'`` row whose target is another image of the same identity
      (the image half a list-length further on, wrapping around);
    * an ``'imposter'`` row whose target is a random image of a randomly
      chosen different identity.

    Changes compared to the previous implementation:

    * Rows are collected in a list and the DataFrame is built once, instead
      of appending with ``result_df.loc[counter] = ...`` on every pair, which
      becomes very slow for large registries.
    * Genuine targets no longer mirror each other (the old ``i`` /
      ``n_pairs - i`` indexing produced both (1, 24) and (24, 1)). Pairs are
      all distinct whenever an identity has at least ``2 * n_pairs`` images.
    * The imposter identity is compared by value, and a registry with fewer
      than two identities raises instead of looping forever.

    Parameters
    ----------
    image_registry : dict
        Maps identity -> list of image paths.
    idnt_idxs : list
        Identities to draw imposters from; each must be a key of
        ``image_registry``.
    n_pairs : int, optional
        Number of genuine (and imposter) pairs per identity. When omitted,
        the notebook-level ``MAX_PAIRS_FOR_IDNT_PER_CLS`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'source'``, ``'target'``, ``'similarity'``.

    Raises
    ------
    ValueError
        If fewer than two distinct identities are available, or an identity
        has too few images to produce ``n_pairs`` pairs.
    """
    if n_pairs is None:
        n_pairs = MAX_PAIRS_FOR_IDNT_PER_CLS

    columns = ['source', 'target', 'similarity']
    if not image_registry or n_pairs <= 0:
        return pd.DataFrame(columns=columns)

    if len(set(idnt_idxs)) < 2:
        raise ValueError(
            'at least two distinct identities are required to build imposter pairs')

    min_images = max(n_pairs, 2)
    records = []
    for idnt, images in image_registry.items():
        n_images = len(images)
        if n_images < min_images:
            raise ValueError(
                'identity {!r} has {} images, at least {} are required'.format(
                    idnt, n_images, min_images))

        # 0 < offset < n_images, so a source is never paired with itself.
        offset = n_images // 2
        for pair_i in range(n_pairs):
            source = images[pair_i]

            # genuine pair
            target = images[(pair_i + offset) % n_images]
            records.append((source, target, 'genuine'))

            # imposter pair: rejection-sample an identity other than this one
            imposter_idnt = random.choice(idnt_idxs)
            while imposter_idnt == idnt:
                imposter_idnt = random.choice(idnt_idxs)
            target = random.choice(image_registry[imposter_idnt])
            records.append((source, target, 'imposter'))

    return pd.DataFrame(records, columns=columns)

# final valid df

In [17]:
# Build the identity -> image-path registry and the identity list for the valid split.
valid_img_reg, valid_idnt_idx = prepare_idnt_image_idxs(valid_filtered_df)

In [18]:
# Generate the genuine / imposter pair table for the valid split.
valid_result_df = combine_pairs(valid_img_reg, valid_idnt_idx)

In [19]:
# Sanity check: row count, unique sources / targets and the two similarity labels.
valid_result_df.describe()

Unnamed: 0,source,target,similarity
count,20750,20750,20750
unique,10375,16765,2
top,n003490/0022_01.jpg,n001935/0029_01.jpg,genuine
freq,2,5,10375


In [20]:
# File name encodes the number of genuine (g) and imposter (i) pairs per identity.
valid_pairs_path = 'dataset/valid_g{}_i{}_pairs.csv'.format(
    MAX_PAIRS_FOR_IDNT_PER_CLS, MAX_PAIRS_FOR_IDNT_PER_CLS)
valid_result_df.to_csv(valid_pairs_path, index=False)

# final train df

In [21]:
# Number of images per identity in the train split, as a one-column frame.
train_counted_df = train_df.groupby('identity').size().to_frame('cnt')

# Keep only identities that have at least MAX_IMGS_PER_IDNT images.
has_enough_train_imgs = train_counted_df['cnt'] >= MAX_IMGS_PER_IDNT
train_counted_df = train_counted_df[has_enough_train_imgs]

train_counted_df.head()

Unnamed: 0_level_0,cnt
identity,Unnamed: 1_level_1
n000002,81
n000003,50
n000004,79
n000005,51
n000006,121


In [22]:
# Distribution of images per identity among the identities kept for training.
train_counted_df.describe()

Unnamed: 0,cnt
count,7841.0
mean,93.050376
std,26.534679
min,50.0
25%,72.0
50%,90.0
75%,110.0
max,220.0


In [24]:
# Restrict the train rows to the identities that passed the count filter.
train_keep_mask = train_df['identity'].isin(train_counted_df.index)
train_filtered_df = train_df[train_keep_mask]
train_filtered_df.describe()

Unnamed: 0,width,hight
count,729608.0,729608.0
mean,245.9621,270.086611
std,29.346988,38.07449
min,200.0,131.0
25%,220.0,241.0
50%,244.0,266.0
75%,271.0,296.0
max,300.0,700.0


In [33]:
# Build the identity -> image-path registry and the identity list for the train split.
train_img_reg, train_idnt_idx = prepare_idnt_image_idxs(train_filtered_df)

In [35]:
# Generate the genuine / imposter pair table for the train split.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# train_result_df was not produced in that session.
train_result_df = combine_pairs(train_img_reg, train_idnt_idx)

KeyboardInterrupt: 

In [None]:
# File name encodes the number of genuine (g) and imposter (i) pairs per identity.
train_pairs_path = 'dataset/train_g{}_i{}_pairs.csv'.format(
    MAX_PAIRS_FOR_IDNT_PER_CLS, MAX_PAIRS_FOR_IDNT_PER_CLS)
train_result_df.to_csv(train_pairs_path, index=False)