# Deepface dataset CSV file generation

- Creates a CSV file of image pairs (positive and negative test cases, sampled per gender) that is used to index into the dataset

### Import relevant libraries 

In [5]:
import os
import pandas as pd
import random as rd
import ast
import shutil

# Define the dataset for which the CSV file is generated:

In [2]:
# Name of the dataset folder under ../data/ for which the pairs CSV is created.
dataset_name = "LFW_gender"

# Number of pairs sampled per test type (positive / negative) and per gender.
# E.g. test_instances = 500:
#   positive + negative pairs for one gender = 500 + 500 = 1000 rows
#   Male + Female                            = 1000 + 1000 = 2000 rows in total
test_instances = 500 

## CSV file generation

In [15]:
# sample the positive / negative pair rows for a single gender folder
def _sample_gender_pairs(gender_dir, test_instances):
    """Return CSV rows with sampled image pairs for one gender folder.

    Every sub-directory of ``gender_dir`` is treated as one identity and the
    files inside it as that identity's images.

    Args:
        gender_dir: Path of the folder holding one sub-folder per identity.
        test_instances: Number of positive rows and number of negative rows
            to produce. Values <= 0 produce no rows.

    Returns:
        A list of newline-terminated strings: first ``test_instances`` rows
        ``"True,<img1>,<img2>"`` (two different images of one identity),
        then ``test_instances`` rows ``"False,<img1>,<img2>"`` (one image
        from each of two different identities).

    Raises:
        ValueError: If no identity has at least two images (no positive pair
            can be formed) or fewer than two identities contain an image
            (no negative pair can be formed).
    """
    rows = []
    if test_instances <= 0:
        return rows

    # List every identity folder exactly once instead of on every draw.
    # Stray files next to the identity folders (e.g. .DS_Store) are ignored.
    images_by_identity = {}
    for identity in os.listdir(gender_dir):
        identity_dir = os.path.join(gender_dir, identity)
        if os.path.isdir(identity_dir):
            images_by_identity[identity] = os.listdir(identity_dir)

    # Positive pairs need an identity with >= 2 images; negative pairs need
    # two different identities that each hold >= 1 image.
    multi_image = [imgs for imgs in images_by_identity.values() if len(imgs) > 1]
    non_empty = [imgs for imgs in images_by_identity.values() if imgs]

    # Fail loudly instead of looping forever when sampling is impossible.
    if not multi_image:
        raise ValueError(
            f"No identity in '{gender_dir}' has at least two images; "
            "cannot sample positive pairs."
        )
    if len(non_empty) < 2:
        raise ValueError(
            f"Fewer than two identities with images in '{gender_dir}'; "
            "cannot sample negative pairs."
        )

    # Positive cases: two distinct images of one uniformly chosen identity.
    for _ in range(test_instances):
        first, second = rd.sample(rd.choice(multi_image), 2)
        rows.append(f"True,{first},{second}\n")

    # Negative cases: one image from each of two distinct identities.
    for _ in range(test_instances):
        images1, images2 = rd.sample(non_empty, 2)
        rows.append(f"False,{rd.choice(images1)},{rd.choice(images2)}\n")

    return rows


# generate pairs CSV file for the dataset
def generate_pairs_csv(dataset_name, test_instances, resample_pairs=True):
    """Create the pairs CSV for ``dataset_name`` under ``../data``.

    The dataset is read from ``../data/<dataset_name>/<gender>/<identity>/``
    for the genders "Male" and "Female". The result is written to
    ``../data/deepface_<dataset_name>_csv/pairs_<dataset_name>.csv`` with the
    header ``test_case,imagenum1,imagenum2`` followed, per gender, by
    ``test_instances`` positive rows and ``test_instances`` negative rows.
    On first use the dataset is also copied to
    ``../data/deepface_<dataset_name>/<dataset_name>``.

    Args:
        dataset_name: Name of the dataset folder inside ``../data``.
        test_instances: Number of positive and of negative pairs per gender.
        resample_pairs: If True (default) an existing CSV is overwritten with
            freshly sampled pairs. If False an existing CSV is left untouched.

    Raises:
        ValueError: If a gender folder does not contain enough identities or
            images to sample the requested pairs. The CSV is not written in
            that case.
    """
    dataset_dir = f"../data/{dataset_name}"

    # create csv file folder directory if not exist
    folder_dest_dir = f"../data/deepface_{dataset_name}_csv"
    os.makedirs(folder_dest_dir, exist_ok=True)

    csv_dest_dir = f"{folder_dest_dir}/pairs_{dataset_name}.csv"

    # Keep an existing CSV as-is unless resampling was requested; appending to
    # it would silently duplicate the test set on every call.
    if os.path.exists(csv_dest_dir) and not resample_pairs:
        return

    # copy original dataset with gender split into deepface_data
    copy_original_dataset_path = f"../data/deepface_{dataset_name}/{dataset_name}"
    if not os.path.exists(copy_original_dataset_path):
        shutil.copytree(dataset_dir, copy_original_dataset_path, dirs_exist_ok=True)

    # Collect every row first so a sampling error never leaves a
    # half-written CSV behind.
    rows = ["test_case,imagenum1,imagenum2\n"]
    for gender in ["Male", "Female"]:
        rows.extend(_sample_gender_pairs(f"{dataset_dir}/{gender}", test_instances))

    with open(csv_dest_dir, "w", encoding="utf-8") as f:
        f.writelines(rows)

In [16]:
# Build the pairs CSV for the dataset configured above. resample_pairs is left
# at its default (True), so an existing CSV is recreated with new random pairs.
generate_pairs_csv(dataset_name, test_instances)