# Organising LFW Dataset into Male and Female 
*  https://www.dropbox.com/sh/l3ezp9qyy5hid80/AAAjK6HdDScd_1rXASlsmELla?dl=0 (manual labels for the gender)
* Process original LFW dataset into 2 folders (female, male) - each contains images of people from LFW.
* `.jpg` has been manually added to `Allison_Searing_0001` in `female_names.txt` 
* Some names appear in both the male and female text files. These duplicates were manually inspected and all found to be female, so they are removed from the male list.
* "Tara_Kirk" has been manually added to the female names text file because she was originally missing from it.

In [130]:
import os
from os import listdir
from os.path import isfile, join
from distutils.dir_util import copy_tree # for copying whole folder to another directory
import shutil


* Each text file contains one image filename per line, in `\[name]_\[0000].jpg` format.
* Each file is processed into a set of unique identities by ignoring the trailing `_0000.jpg` of every line.

In [131]:
def process_manual_labels(filename):
    """Read a gender label file and return the unique identity names in it.

    Each non-blank line is expected to look like ``<name>_<index>.jpg``
    (for example ``Aaron_Eckhart_0001.jpg``). The extension and the numeric
    image index are dropped so that every person appears exactly once.

    Args:
        filename: Path to the text file of image names, one per line.

    Returns:
        A set of identity names (strings).
    """
    names = set()
    with open(filename) as f:
        for line in f:
            entry = line.strip()
            # Remove the extension and index by pattern instead of slicing a
            # fixed number of characters, so a last line without a trailing
            # newline or an entry lacking ".jpg" is not cut short.
            if entry.lower().endswith(".jpg"):
                entry = entry[:-len(".jpg")]
            stem, sep, index = entry.rpartition("_")
            if sep and index.isdigit():
                entry = stem
            if entry:
                names.add(entry)

    return names

In [132]:
# Root of the original LFW dataset and root of the gender-split copy.
LFW_DIR = "data/LFW/"
NEW_LFW_DIR = 'data/LFW_gender/'

# Every directory directly under LFW_DIR is one identity of the dataset.
sub_folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(LFW_DIR, entry)),
        os.listdir(LFW_DIR),
    )
)

male_names = process_manual_labels("male_names.txt")
female_names = process_manual_labels("female_names.txt")

# Names listed under both genders are all female (checked by hand),
# so they are taken out of the male set.
female_names_in_male = female_names & male_names
male_names = male_names.difference(female_names_in_male)

# Sanity check: the two label sets together must account for every folder.
assert len(male_names) + len(female_names) == len(sub_folders), (
    f"The number of people in actual LFW dataset is not the same as total people from the gender text files! "
    f"{len(sub_folders)} != {len(male_names) + len(female_names)}"
)



In [133]:
# Dataset locations (same values as set earlier in the notebook).
LFW_DIR = "data/LFW/"
NEW_LFW_DIR = 'data/LFW_gender/'

# Count the identity folders that sit directly under LFW_DIR.
sub_folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(LFW_DIR, entry)),
        os.listdir(LFW_DIR),
    )
)

print(len(sub_folders))

5749


In [134]:
def purge_lfw_gender(root=None):
    """Reset the gender output folders to an empty state.

    Removes ``Female`` and ``Male`` under the output root together with
    everything inside them, then recreates both as empty directories.

    Args:
        root: Directory that holds the two gender folders. Defaults to the
            module-level ``NEW_LFW_DIR``.
    """
    if root is None:
        root = NEW_LFW_DIR

    for gender in ("Female", "Male"):
        gender_dir = os.path.join(root, gender)
        # On a first run the folder does not exist yet and rmtree would raise.
        if os.path.isdir(gender_dir):
            shutil.rmtree(gender_dir)
        # makedirs also creates the output root itself when it is missing.
        os.makedirs(gender_dir)

In [135]:
def copy_to_new_labelled_folder(names, isMale=True, src_dir=None, dst_dir=None):
    """Copy each person's image folder into the matching gender folder.

    Args:
        names: Iterable of identity names; each one is looked up as a
            sub-folder of the source directory.
        isMale: Copy into ``Male/`` when true, into ``Female/`` otherwise.
        src_dir: Directory containing one folder per person. Defaults to
            the module-level ``LFW_DIR``.
        dst_dir: Directory containing the gender folders. Defaults to the
            module-level ``NEW_LFW_DIR``.

    Returns:
        A ``(found, not_found)`` tuple of lists. ``found`` holds the names
        that were copied; ``not_found`` holds the names whose source folder
        is missing or contains no files.

    Raises:
        FileExistsError: If the destination folder for a person already
            exists.
    """
    if src_dir is None:
        src_dir = LFW_DIR
    if dst_dir is None:
        dst_dir = NEW_LFW_DIR
    gender_dir = os.path.join(dst_dir, "Male" if isMale else "Female")

    found = []
    not_found = []
    for name in names:
        person_folder_path = os.path.join(src_dir, name)

        # A missing folder is reported as not found instead of letting
        # listdir raise; an existing folder without any file counts too.
        has_images = os.path.isdir(person_folder_path) and any(
            os.path.isfile(os.path.join(person_folder_path, f))
            for f in os.listdir(person_folder_path)
        )
        if not has_images:
            not_found.append(name)
            continue

        # copytree creates the destination (and the gender folder if it is
        # absent) itself, so no separate mkdir is needed.
        shutil.copytree(person_folder_path, os.path.join(gender_dir, name))
        found.append(name)

    return found, not_found

In [136]:
# Start from empty Male/Female folders, then copy every male identity across.
purge_lfw_gender()
found, not_found = copy_to_new_labelled_folder(names=male_names, isMale=True)

In [137]:
# Copy every female identity into the Female folder.
female_found, female_not_found = copy_to_new_labelled_folder(
    names=female_names,
    isMale=False,
)

In [138]:
# Show the identities that could not be copied: male list, then female list.
print(not_found, female_not_found, sep="\n")

[]
[]


In [139]:
# check if the newly copied folders in LFW_Gender are all there