## 10-fold cross-validation sampled from  LFW perturbed data  

### Import relevant libraries 

In [1]:
import os
import pandas as pd
import random as rd
import ast

# Define dataset to generate CSV files: 

In [5]:
# Name of the dataset folder under ../data/ for which the pairs CSV is built.
dataset_name = "LFW_gender"

# Number of pairs to draw for each (gender, positive/negative) combination.
# generate_pairs_csv loops over 2 genders x 2 cases, so
# test_instances = 500 produces 4 * 500 = 2000 rows in total
# (500 positive + 500 negative pairs per gender).
test_instances = 500 

## CSV file generation

In [109]:
# generate pairs CSV file for the dataset
def generate_pairs_csv(dataset_name, test_instances, resample_pairs):
    """Generate the image-pair CSV for ``dataset_name``.

    The dataset is read from ``../data/{dataset_name}/<gender>/<identity>/``
    for the genders ``Male`` and ``Female``. For each gender,
    ``test_instances`` positive pairs (two distinct images of one identity)
    followed by ``test_instances`` negative pairs (one image from each of two
    different identities) are written to
    ``../data/deepface_{dataset_name}_csv/pairs_{dataset_name}_gender.csv``
    under the header ``test_case,imagenum1,imagenum2``.

    Args:
        dataset_name: Name of the dataset folder under ``../data``.
        test_instances: Number of pairs (int) to draw for each
            (gender, positive/negative) combination.
        resample_pairs: When true, an existing CSV is overwritten with freshly
            sampled pairs. When false, an existing CSV is left untouched, so
            repeated calls no longer append duplicate batches of rows.

    Raises:
        ValueError: If pairs are requested but a gender folder has no identity
            with at least two images (no positive pair possible) or fewer than
            two non-empty identities (no negative pair possible).
    """
    dataset_dir = f"../data/{dataset_name}"
    folder_dest_dir = f"../data/deepface_{dataset_name}_csv"
    csv_dest_dir = f"{folder_dest_dir}/pairs_{dataset_name}_gender.csv"

    # create csv file folder directory if not exist
    os.makedirs(folder_dest_dir, exist_ok=True)

    # Keep previously sampled pairs unless a resample was explicitly requested.
    if os.path.exists(csv_dest_dir) and not resample_pairs:
        return

    # Rows are collected in memory and written once at the end, so a failure
    # part-way through never leaves a truncated CSV behind.
    rows = ["test_case,imagenum1,imagenum2\n"]

    # iterate through the identity labels by gender
    for gender in ["Male", "Female"]:
        gender_dir = f"{dataset_dir}/{gender}"

        # List every identity folder once instead of on every random draw.
        # Stray files and empty identity folders are ignored.
        images_by_identity = {}
        for identity in os.listdir(gender_dir):
            identity_dir = f"{gender_dir}/{identity}"
            if not os.path.isdir(identity_dir):
                continue
            images = os.listdir(identity_dir)
            if images:
                images_by_identity[identity] = images

        identities = list(images_by_identity)
        multi_image_identities = [
            identity
            for identity in identities
            if len(images_by_identity[identity]) > 1
        ]

        # Fail fast where rejection sampling would otherwise spin forever.
        if test_instances > 0:
            if not multi_image_identities:
                raise ValueError(
                    f"No identity in {gender_dir} has at least two images; "
                    "cannot sample positive pairs."
                )
            if len(identities) < 2:
                raise ValueError(
                    f"{gender_dir} needs at least two identities with images "
                    "to sample negative pairs."
                )

        for positive_case in [True, False]:
            for _ in range(test_instances):
                if positive_case:
                    # two different images of the same identity
                    identity = rd.choice(multi_image_identities)
                    imagenum1, imagenum2 = rd.sample(
                        images_by_identity[identity], 2
                    )
                else:
                    # one image from each of two different identities
                    identity1, identity2 = rd.sample(identities, 2)
                    imagenum1 = rd.choice(images_by_identity[identity1])
                    imagenum2 = rd.choice(images_by_identity[identity2])
                rows.append(f"{positive_case},{imagenum1},{imagenum2}\n")

    # write data into csv file
    with open(csv_dest_dir, "w") as f:
        f.writelines(rows)

In [113]:
# Build the pairs CSV for the dataset configured above.
# resample_pairs=False: rows already present in an existing CSV are left in place.
generate_pairs_csv(dataset_name, test_instances, resample_pairs=False)

# Read Perturbed LFW data:

In [117]:
# PERTURBED DATA:
# Male identities of the perturbed LFW set. sample_pair_images parses the
# second column of this frame with ast.literal_eval, so it presumably holds a
# string-encoded list of image names per row -- TODO confirm against the CSV.
full_perturbed_male_names = pd.read_csv('../DeepFace-gender-bias-detection/data/LFW-csv/full_perturbed_male.csv')

# Female identities of the perturbed LFW set (same layout assumed as above).
full_perturbed_female_names = pd.read_csv('../DeepFace-gender-bias-detection/data/LFW-csv/full_perturbed_female.csv')

# Current benchmark pairs CSV; passed as `old_df` to sample_pair_images, which
# concatenates newly sampled pairs onto it.
benchmark_deepface_df = pd.read_csv('../DeepFace-gender-bias-detection/data/LFW-csv/pairs_benchmark_deepface.csv')

In [115]:
def sample_pair_images(perturbed_gender_names, old_df, test_case, output_file, max_pairs=500):
    """Sample image pairs from perturbed identities and append them to a CSV.

    One pair is drawn per row of ``perturbed_gender_names`` (in row order)
    until ``max_pairs`` pairs have been collected or the rows run out. The new
    pairs are concatenated below ``old_df`` and the result is written to
    ``output_file`` in a single write.

    Assumes the second column of ``perturbed_gender_names`` holds a
    string-encoded Python list whose last element is the list of image names
    for that identity (this is how the cell is parsed) -- TODO confirm against
    the CSV that produced it.

    Changes relative to the earlier implementation:
      * The random generator is seeded once per call (seed 1) instead of on
        every row, so rows no longer replay the same random stream.
      * Negative partners are drawn from ``perturbed_gender_names`` itself
        (not from a global male-only frame) and always from a different row,
        so a "negative" pair can no longer be two images from one row.
      * The partner index can no longer fall one past the end of the frame.
      * Sampling stops after ``max_pairs`` rows were *added*, regardless of
        how many rows ``old_df`` already had.
      * The CSV is written once instead of being rewritten and re-read for
        every sampled pair.

    Args:
        perturbed_gender_names: DataFrame with one identity per row; image
            names are read from its second column.
        old_df: Existing pairs with columns ``test_case``, ``imagenum1``,
            ``imagenum2``.
        test_case: True to sample positive pairs (two images of the same
            row), False to sample negative pairs (images of different rows).
        output_file: Path of the CSV to write.
        max_pairs: Maximum number of pairs to add in this call.

    Returns:
        The merged DataFrame that was written to ``output_file``, or
        ``old_df`` unchanged (and no file written) if no pair could be added.

    Raises:
        ValueError: If a negative pair is requested but only one row has any
            images, so no partner from a different row exists.
    """
    # Local generator: reproducible per call without touching the global
    # `random` state used elsewhere in the notebook.
    rng = rd.Random(1)

    # convert string list to list (once per row)
    image_lists = [
        ast.literal_eval(cell)[-1]
        for cell in perturbed_gender_names.iloc[:, 1]
    ]
    # Row positions that can serve as the partner in a negative pair.
    eligible = [pos for pos, images in enumerate(image_lists) if images]

    new_rows = []
    for pos, images in enumerate(image_lists):
        if len(new_rows) >= max_pairs:
            break

        if test_case:
            # Positive case: match 2 images from the same person.
            if len(images) < 2:
                continue
            imagenum1, imagenum2 = rng.sample(images, 2)
        else:
            # Negative case: match 2 images from different persons.
            if not images:
                continue
            if len(eligible) < 2:
                raise ValueError(
                    "Need at least two identities with images to sample "
                    "negative pairs."
                )
            # `pos` is in `eligible` and there are >= 2 entries, so this
            # rejection loop terminates.
            other = pos
            while other == pos:
                other = rng.choice(eligible)
            imagenum1 = rng.choice(images)
            imagenum2 = rng.choice(image_lists[other])

        new_rows.append(
            {"test_case": test_case, "imagenum1": imagenum1, "imagenum2": imagenum2}
        )

    if not new_rows:
        return old_df

    # merge both current csv + new perturbed data, with a fresh 0..n-1 index
    merged = pd.concat([old_df, pd.DataFrame(new_rows)], ignore_index=True)

    # Write results to csv file
    merged.to_csv(output_file, index=False)
    return merged

In [118]:
# Sample 500 positive matches  
# sample_pair_images(full_perturbed_male_names, benchmark_deepface_df, True, '../DeepFace-gender-bias-detection/data/LFW-csv/pairs_benchmark_deepface.csv')
# sample_pair_images(full_perturbed_female_names, benchmark_deepface_df, True, '../DeepFace-gender-bias-detection/data/LFW-csv/pairs_benchmark_deepface.csv')


# Sample 500 negative matches 
# sample_pair_images(full_perturbed_male_names, benchmark_deepface_df, False, '../DeepFace-gender-bias-detection/data/LFW-csv/pairs_benchmark_deepface.csv')
# sample_pair_images(full_perturbed_female_names, benchmark_deepface_df, False, '../DeepFace-gender-bias-detection/data/LFW-csv/pairs_benchmark_deepface.csv')