In [1]:
import os
import numpy as np
import pandas as pd
from random import sample
from glob import glob

In [2]:
# Project root; set the KINSHIP_BASE_PATH environment variable to run the
# notebook outside the original /root layout.
BASE_PATH = os.environ.get("KINSHIP_BASE_PATH", "/root/KinshipRecognition")
# CSV index of labelled training pairs.
TRAIN_FILE = f"{BASE_PATH}/data/train_ds.csv"
# Directory searched for face images; its entries are treated as families.
TRAIN_FOLDERS = f"{BASE_PATH}/data/train/train-faces"
# Upper bound on the number of image pairs sampled per source pair / family pair.
MAX_SAMPLES_PER_PAIR = 10
# Seed NumPy's global RNG so the randomly sampled pairs are reproducible
# under Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

In [3]:
# Read the training index; the first CSV column is dropped
train_ds = np.asarray(pd.read_csv(TRAIN_FILE))[:, 1:]
# Group rows by the value in their last column (the label)
ds_by_label = {
    label: train_ds[train_ds[:, -1] == label]
    for label in np.unique(train_ds[:, -1])
}

In [4]:
# Augments existing kin and non-kin samples
def augment_train_ds(ds_by_label, exclude_labels=None):
    """Add extra image pairs for every labelled pair of people.

    For each ``(p1, p2, label)`` row, the other images stored in the
    directories of ``p1`` and ``p2`` (the first two path components, resolved
    under ``TRAIN_FOLDERS``) are combined into new pairs carrying the same
    label. At most ``MAX_SAMPLES_PER_PAIR`` distinct pairs are kept per row.

    Args:
        ds_by_label: Mapping from label to a 2-D array of
            ``(p1, p2, label)`` rows. Entries are replaced in place.
        exclude_labels: Labels to leave untouched. ``None`` (the default) or
            an empty collection augments every label.

    Returns:
        The same ``ds_by_label`` mapping that was passed in.
    """
    excluded = [] if exclude_labels is None else list(exclude_labels)
    for label in list(ds_by_label.keys()):
        if label in excluded:
            continue
        ds_samples = ds_by_label[label]
        new_rows = []
        for tup in ds_samples:
            # Other images of each person: everything in the same directory
            # except the image this row already uses.
            siblings = []
            for img_path in (tup[0], tup[1]):
                parts = img_path.split('/')
                person_dir = '/'.join(parts[:2])
                # Sorted because os.listdir order is arbitrary; a fixed order
                # keeps the result reproducible for a fixed random seed.
                siblings.append([
                    f"{person_dir}/{name}"
                    for name in sorted(os.listdir(f"{TRAIN_FOLDERS}/{person_dir}"))
                    if name != parts[-1]
                ])
            mid_1, mid_2 = siblings
            pairs = [(img_1, img_2) for img_2 in mid_2 for img_1 in mid_1]
            if len(pairs) > MAX_SAMPLES_PER_PAIR:
                # Sample without replacement so no pair is emitted twice.
                picked = np.random.choice(len(pairs), MAX_SAMPLES_PER_PAIR, replace=False)
                pairs = [pairs[i] for i in picked]
            new_rows.extend([img_1, img_2, label] for img_1, img_2 in pairs)
        if new_rows:
            # Guarded: joining an empty 1-D array onto the 2-D samples raises.
            ds_by_label[label] = np.r_[ds_samples, np.asarray(new_rows, dtype=object)]
    return ds_by_label

In [5]:
# Generates non-kin samples
def expand_train_ds(n_expand_samples, label=0):
    """Generate image pairs whose two images come from different families.

    Two distinct entries of ``TRAIN_FOLDERS`` are drawn at random; every image
    of the first is paired with every image of the second, and at most
    ``MAX_SAMPLES_PER_PAIR`` distinct pairs are kept per draw. Draws repeat
    until enough rows exist. The same two families may be drawn more than
    once, so rows can repeat across draws.

    Args:
        n_expand_samples: Number of rows to generate.
        label: Value stored in the third column of every row.

    Returns:
        Object array of shape ``(n_expand_samples, 3)`` with rows
        ``(p1, p2, label)``; shape ``(0, 3)`` when ``n_expand_samples <= 0``.

    Raises:
        ValueError: If rows are requested but fewer than two families contain
            images, since no cross-family pair can then be formed.
    """
    n_rows = int(n_expand_samples)
    if n_rows <= 0:
        return np.empty((0, 3), dtype=object)

    # List each family's images once instead of re-globbing on every draw.
    # Families without images are dropped: they can never yield a pair.
    images_by_family = {}
    for family in sorted(os.listdir(TRAIN_FOLDERS)):
        images = [
            '/'.join(x.split('/')[-3:])
            for x in sorted(glob(f"{TRAIN_FOLDERS}/{family}/*/*"))
        ]
        if images:
            images_by_family[family] = images
    families = list(images_by_family)
    if len(families) < 2:
        # Without this guard the sampling loop below would never terminate.
        raise ValueError(
            f"Need at least two families with images under {TRAIN_FOLDERS}, "
            f"found {len(families)}."
        )

    sample_list = []
    while len(sample_list) < n_rows:
        # Uniform over ordered pairs of distinct families.
        fam_1, fam_2 = np.random.choice(families, 2, replace=False)
        pairs = [
            (img_1, img_2)
            for img_2 in images_by_family[fam_2]
            for img_1 in images_by_family[fam_1]
        ]
        if len(pairs) > MAX_SAMPLES_PER_PAIR:
            # Sample without replacement so a draw never repeats a pair.
            picked = np.random.choice(len(pairs), MAX_SAMPLES_PER_PAIR, replace=False)
            pairs = [pairs[i] for i in picked]
        sample_list.extend([img_1, img_2, label] for img_1, img_2 in pairs)
    # The last draw may overshoot; return exactly the requested number of rows.
    return np.asarray(sample_list[:n_rows], dtype=object)

In [6]:
# Augment existing dataset. A shallow copy is passed because augment_train_ds
# replaces entries of the mapping it receives; copying keeps ds_by_label
# untouched, so re-running this cell does not augment already-augmented data.
aug_ds_by_label = augment_train_ds(dict(ds_by_label), exclude_labels=[0])

In [7]:
# Ratio of 0-samples to 1-samples
ZEROS_RATIO = 1
# Number of 0-labelled rows still missing to reach ZEROS_RATIO zeros per one.
# Computed directly from the counts so that any positive shortfall is filled
# (not only shortfalls of at least len(zeros)) and no float truncation occurs.
n_expand_samples = int(round(len(aug_ds_by_label[1]) * ZEROS_RATIO)) - len(aug_ds_by_label[0])
if n_expand_samples > 0:
    # Expand dataset by adding 0-labelled samples
    ds_zeros = expand_train_ds(n_expand_samples, label=0)
    # Append expanded samples to augmented samples
    aug_ds_by_label[0] = np.r_[aug_ds_by_label[0], ds_zeros]
elif n_expand_samples < 0:
    # There are already more zeros than the ratio allows; nothing is removed.
    print("Ratio requires truncating the dataset. This function only augments the dataset.")

In [8]:
# Stack the 0-labelled rows above the 1-labelled rows, name the columns and
# write the result to .csv
aug_ds = pd.DataFrame(
    np.concatenate([aug_ds_by_label[0], aug_ds_by_label[1]], axis=0),
    columns=['p1', 'p2', 'relationship'],
)
aug_ds.to_csv(f"{BASE_PATH}/data/aug_train_ds.csv")