# Deepface dataset CSV file generation

- Creates a CSV file of image pairs for indexing across datasets

### Import relevant libraries 

In [9]:
import os
import pandas as pd
import random as rd
import ast
import shutil
from deepface_main_test_func import get_name

# Define the dataset to generate CSV files for:

In [378]:
# Dataset configuration for CSV creation.
# org_dataset_name: base name used to build the "./data/deepface_<name>" folders
#                   and the names of the attribute datasets (e.g. "<name>_glasses").
# dataset_for_csv:  dataset folder whose images are sampled into the pairs CSV.
org_dataset_name = "LFW_gender"
dataset_for_csv = "LFW_gender"

# Number of test instances generated for EACH of the positive and negative cases,
# per gender. Total CSV rows = 4 * test_instances.
# E.g. test_instances = 500:
#   positive + negative tests = 500 + 500 = 1000 instances per gender
#   Male + Female = 1000 + 1000 = 2000 instances in total
test_instances = 1500

## CSV file generation

In [379]:
# Test full auto
def check_if_image_in_other_dataset(other_datasets, org_dataset_name, image):
    """Return True when `image` exists in every dataset of `other_datasets`.

    Each dataset is looked up under ``./data/deepface_{org_dataset_name}/{dataset}``
    via ``check_if_image_exist``.

    Args:
        other_datasets: iterable of dataset folder names to check.
        org_dataset_name: base dataset name used to build the lookup root.
        image: image file name to look for.

    Returns:
        bool: True if the image is found in all datasets (also True when
        `other_datasets` is empty), False as soon as one dataset lacks it.
    """
    # all() stops at the first dataset that is missing the image, so no
    # further filesystem checks are made once the answer is known.
    return all(
        check_if_image_exist(f"./data/deepface_{org_dataset_name}/{dataset}", image)
        for dataset in other_datasets
    )


def check_if_image_exist(other_dataset_path, image):
    """Tell whether `image` is stored under either gender folder of a dataset.

    The file is searched at ``{other_dataset_path}/{gender}/{name}/{image}``,
    where ``name`` is the value returned by ``get_name(image)``. The "Male"
    folder is tried first, then "Female".

    Returns:
        bool: True if the file exists in one of the two gender folders.
    """
    identity = get_name(image)

    for gender in ("Male", "Female"):
        if os.path.exists(f"{other_dataset_path}/{gender}/{identity}/{image}"):
            return True
    return False

# generate pairs CSV file for the dataset
def _make_usable_lookup(gender_dir, other_datasets, org_dataset_name):
    """Build a memoised lookup: identity label -> its images present in all other datasets."""
    cache = {}

    def lookup(identity):
        if identity not in cache:
            # sorted(): os.listdir returns entries in arbitrary order, which would
            # make the seeded sampling below depend on the filesystem.
            cache[identity] = [
                image
                for image in sorted(os.listdir(f"{gender_dir}/{identity}"))
                if check_if_image_in_other_dataset(other_datasets, org_dataset_name, image)
            ]
        return cache[identity]

    return lookup


def _pick_identity(identity_labels, usable_for, min_images, exclude=None):
    """Draw random identities until one (other than `exclude`) has >= `min_images` usable images."""
    rejected = set()
    while True:
        identity = rd.choice(identity_labels)
        if identity != exclude:
            images = usable_for(identity)
            if len(images) >= min_images:
                return identity, images
        rejected.add(identity)
        # Every identity has been drawn and turned down: retrying can never succeed.
        if len(rejected) == len(identity_labels):
            raise ValueError(
                f"no identity with at least {min_images} image(s) shared by all datasets"
            )


# generate pairs CSV file for the dataset
def generate_pairs_csv_full_auto(org_dataset_name, dataset_for_csv, test_instances, resample_pairs=True, seed_num=123):
    """Write ``./data/deepface_{org_dataset_name}_csv/pairs_{dataset_for_csv}.csv``.

    For each gender folder ("Male", then "Female") the CSV receives
    `test_instances` positive rows (two distinct images of one identity) followed
    by `test_instances` negative rows (one image each from two different
    identities). Only images that ``check_if_image_in_other_dataset`` finds in
    the glasses / makeup_heavy / makeup_light / moustache datasets are used.

    Args:
        org_dataset_name: base dataset name; used for the ``deepface_*`` folders
            and for the names of the four attribute datasets.
        dataset_for_csv: dataset folder to sample from, read from
            ``./data/{dataset_for_csv}`` or, if that is absent, from
            ``./data/deepface_{org_dataset_name}/{dataset_for_csv}``.
        test_instances: number of rows per case (positive/negative) per gender.
        resample_pairs: when False and the CSV already exists, the existing file
            is left untouched and nothing is generated.
        seed_num: seed passed to ``random.seed``.

    Raises:
        FileNotFoundError: if the dataset folder exists in neither location.
        ValueError: if no identity (or no second, different identity) has enough
            usable images to form a pair.
    """
    rd.seed(seed_num)

    deepface_dataset_dir = f"./data/deepface_{org_dataset_name}/{dataset_for_csv}"
    dataset_dir = f"./data/{dataset_for_csv}"
    if not os.path.exists(dataset_dir):
        dataset_dir = deepface_dataset_dir
    # Fail before creating any folder or file when there is nothing to sample from.
    if not os.path.exists(dataset_dir):
        raise FileNotFoundError(
            f"dataset not found at ./data/{dataset_for_csv} or {deepface_dataset_dir}"
        )

    # create csv file folder directory if not exist
    folder_dest_dir = f"./data/deepface_{org_dataset_name}_csv"
    os.makedirs(folder_dest_dir, exist_ok=True)

    # keep an existing csv as-is unless resampling was requested
    # (appending a fresh batch of rows to the old file would duplicate the test set)
    csv_dest_dir = f"{folder_dest_dir}/pairs_{dataset_for_csv}.csv"
    if os.path.exists(csv_dest_dir) and not resample_pairs:
        return

    # copy original dataset with gender split into deepface_{dataset}
    if not os.path.exists(deepface_dataset_dir):
        shutil.copytree(dataset_dir, deepface_dataset_dir, dirs_exist_ok=True)

    other_datasets = [f"{org_dataset_name}_glasses", f"{org_dataset_name}_makeup_heavy", \
                        f"{org_dataset_name}_makeup_light", f"{org_dataset_name}_moustache"]

    with open(csv_dest_dir, "w", encoding="utf-8") as f:
        # create csv header
        f.write("test_case,imagenum1,imagenum2\n")

        # iterate through the identity labels by gender
        for gender in ["Male", "Female"]:
            gender_dir = f"{dataset_dir}/{gender}"

            # identity labels are the sub-folders; sorted so a seed reproduces the same pairs
            identity_labels = sorted(
                entry for entry in os.listdir(gender_dir)
                if os.path.isdir(f"{gender_dir}/{entry}")
            )
            usable_for = _make_usable_lookup(gender_dir, other_datasets, org_dataset_name)

            for positive_case in [True, False]:
                for _ in range(test_instances):
                    if positive_case:
                        # two different images of the same identity
                        _, images = _pick_identity(identity_labels, usable_for, 2)
                        imagenum1, imagenum2 = rd.sample(images, 2)
                    else:
                        # one image each from two DIFFERENT identities
                        identity_one, images_one = _pick_identity(identity_labels, usable_for, 1)
                        _, images_two = _pick_identity(
                            identity_labels, usable_for, 1, exclude=identity_one
                        )
                        imagenum1 = rd.sample(images_one, 1).pop()
                        imagenum2 = rd.sample(images_two, 1).pop()

                    f.write(f"{positive_case},{imagenum1},{imagenum2}\n")

# Generate the pairs CSV for the dataset configured above; the random module is seeded with 115
generate_pairs_csv_full_auto(org_dataset_name, dataset_for_csv, test_instances, seed_num=115)

### Check if each image file exists in the other datasets

In [380]:
# Load the pairs CSV that was generated for `dataset_for_csv`
generated_csv = f"pairs_{dataset_for_csv}.csv"
csv_dir = f"./data/deepface_{org_dataset_name}_csv/{generated_csv}"
dataset_csv = pd.read_csv(csv_dir)

# Attribute datasets in which every sampled image is expected to be present
other_datasets = [
    f"{org_dataset_name}_glasses",
    f"{org_dataset_name}_makeup_heavy",
    f"{org_dataset_name}_makeup_light",
    f"{org_dataset_name}_moustache",
]

def check_if_image_exist(other_dataset_path, image):
    """Tell whether `image` is stored under either gender folder of a dataset.

    The file is searched at ``{other_dataset_path}/{gender}/{name}/{image}``,
    where ``name`` is the value returned by ``get_name(image)``. The "Male"
    folder is tried first, then "Female".

    Returns:
        bool: True if the file exists in one of the two gender folders.
    """
    identity = get_name(image)

    for gender in ("Male", "Female"):
        if os.path.exists(f"{other_dataset_path}/{gender}/{identity}/{image}"):
            return True
    return False

    
# For every attribute dataset, record which images referenced by the pairs CSV
# were found ("checked") and which were not ("missing").
final_analysis = {}
for dataset in other_datasets:
    # the dataset root does not depend on the CSV row, so build it once per dataset
    other_dataset_path = f"./data/deepface_{org_dataset_name}/{dataset}"

    # result analysis of image existence in this dataset
    result = {"checked": [], "missing": []}

    # walk the CSV row by row, first image then second image of each pair
    for image_pair in zip(dataset_csv["imagenum1"], dataset_csv["imagenum2"]):
        for image in image_pair:
            bucket = "checked" if check_if_image_exist(other_dataset_path, image) else "missing"
            result[bucket].append(image)

    final_analysis[f"{dataset}"] = result

In [381]:
# Report, per attribute dataset, how many CSV images were found and how many were missing
for dataset in other_datasets:
    analysis = final_analysis[f"{dataset}"]
    n_checked, n_missing = len(analysis["checked"]), len(analysis["missing"])
    print(f"{dataset}: \nchecked: {n_checked}\nmissing: {n_missing}\n")

LFW_gender_glasses: 
checked: 12000
missing: 0

LFW_gender_makeup_heavy: 
checked: 12000
missing: 0

LFW_gender_makeup_light: 
checked: 12000
missing: 0

LFW_gender_moustache: 
checked: 12000
missing: 0



In [None]:
# Split the first image of every CSV row into four groups by the gender folder
# it is found in (Male is checked before Female) and by positive/negative test case.
# Images found in neither folder are left out of all four lists.
dataset_csv = pd.read_csv(f"./data/deepface_{org_dataset_name}_csv/pairs_{dataset_for_csv}.csv")
male_pos, male_neg, female_pos, female_neg = [], [], [], []

dataset_root = f"./data/deepface_{org_dataset_name}/{dataset_for_csv}"
# Columns are addressed by name: integer keys on a label-indexed row are deprecated in pandas.
for is_positive, image in zip(dataset_csv["test_case"], dataset_csv["imagenum1"]):
    name_img_a = get_name(image)
    if os.path.exists(f"{dataset_root}/Male/{name_img_a}/{image}"):
        (male_pos if is_positive else male_neg).append(image)
    elif os.path.exists(f"{dataset_root}/Female/{name_img_a}/{image}"):
        (female_pos if is_positive else female_neg).append(image)

In [None]:
# Print the size of each gender/test-case group, followed by the overall total
group_sizes = [len(male_pos), len(male_neg), len(female_pos), len(female_neg)]
print("Male pos: {}\nMale neg: {}\nFemale pos: {}\nFemale neg: {}".format(*group_sizes))
print(f"Total: {sum(group_sizes)}")