# Organising LFW Dataset into Male and Female 
*  https://www.dropbox.com/sh/l3ezp9qyy5hid80/AAAjK6HdDScd_1rXASlsmELla?dl=0 (manual labels for the gender)
* Process original LFW dataset into 2 folders (female, male) - each contains images of people from LFW.
* `.jpg` has been manually added to `Allison_Searing_0001` in `female_names.txt` 
* Some names appear in both the male and female text files. These duplicates have been manually inspected and all were found to be female, so they are removed from the male list.
* "Tara_Kirk" has been manually added to the female names text file because the file did not originally contain her.

In [21]:
import os
from os import listdir
from os.path import isfile, join
from distutils.dir_util import copy_tree # for copying whole folder to another directory
import shutil


* Each text file contains one image filename per line, in `\[name]_\[0000].jpg` format.
* The lines are reduced to a set of unique identities by dropping the trailing `_0000.jpg`.

In [22]:
def process_manual_labels(filename):
    """Read a gender label file and return the set of unique identity names.

    Each non-blank line is expected to look like ``[name]_[0000].jpg``. The
    trailing image index (and the ``.jpg`` extension, when present) is
    removed so that only the person's name remains.

    The suffix is matched by pattern rather than by a fixed character count,
    so a final line without a newline, a line missing the ``.jpg`` extension,
    or a line with trailing spaces is still reduced to the right name.

    Args:
        filename: Path of the text file to read.

    Returns:
        A set of identity names (duplicates removed, blank lines ignored).
    """
    import re  # local import so the function does not depend on other cells

    # "_<digits>" optionally followed by ".jpg", anchored at the end of the line.
    image_suffix = re.compile(r"_\d+(?:\.jpg)?$")

    names = set()
    with open(filename) as f:
        for line in f:
            name = image_suffix.sub("", line.strip())
            if name:  # skip blank lines and lines that were only a suffix
                names.add(name)

    return names

In [23]:
# Root of the original LFW dataset and root of the gender-sorted copy.
LFW_DIR = "data/LFW/"
NEW_LFW_DIR = 'data/LFW_gender/'

# Every sub-directory of LFW_DIR is one identity of the actual dataset.
sub_folders = [
    entry
    for entry in os.listdir(LFW_DIR)
    if os.path.isdir(os.path.join(LFW_DIR, entry))
]

# Identity names taken from the manually labelled text files.
male_names = process_manual_labels("male_names.txt")
female_names = process_manual_labels("female_names.txt")

# Names listed under both genders are all female, so drop them from the males.
female_names_in_male = female_names & male_names
male_names = male_names.difference(female_names_in_male)

# Sanity check: the label files together must account for every LFW folder.
assert len(sub_folders) == len(male_names) + len(female_names), f"The number of people in actual LFW dataset is not the same as total people from the gender text files! {len(sub_folders)} != {len(male_names) + len(female_names)}"



In [24]:
# Dataset locations (same values as in the previous cell).
LFW_DIR = "data/LFW/"
NEW_LFW_DIR = 'data/LFW_gender/'

# Collect the identity folders of the original LFW dataset and report how many there are.
sub_folders = [
    entry
    for entry in os.listdir(LFW_DIR)
    if os.path.isdir(os.path.join(LFW_DIR, entry))
]
print(len(sub_folders))

5749


In [53]:
def purge_lfw_gender(base_dir=None, folders=("Female", "Male", "Male_BW")):
    """Reset the gender output folders to empty directories.

    Each folder is deleted together with its contents (if it exists) and then
    recreated empty. Folders that do not exist yet are simply created, so the
    function also works on a first run.

    Args:
        base_dir: Directory that holds the gender folders. Defaults to the
            module-level ``NEW_LFW_DIR``.
        folders: Names of the sub-folders to reset.
    """
    if base_dir is None:
        base_dir = NEW_LFW_DIR

    # NOTE(review): a later cell copies into 'Male_BM/', while this list resets
    # 'Male_BW' -- confirm which spelling is intended before relying on this.
    for folder in folders:
        path = os.path.join(base_dir, folder)
        if os.path.isdir(path):
            shutil.rmtree(path)
        # makedirs also creates base_dir itself when it is missing.
        os.makedirs(path, exist_ok=True)

In [49]:
def copy_to_new_labelled_folder(names, isMale=True, folder_name=None,
                                src_dir=None, dst_dir=None):
    """Copy each person's image folder into a labelled destination folder.

    For every name, the folder ``<src_dir>/<name>`` is copied to
    ``<dst_dir>/<folder_name>/<name>``. A person whose source folder is
    missing or contains no files is reported instead of being copied.

    Args:
        names: Iterable of identity names (folder names inside ``src_dir``).
        isMale: Selects the default destination folder, ``"Male/"`` when true
            and ``"Female/"`` otherwise. Ignored when ``folder_name`` is given.
        folder_name: Destination sub-folder inside ``dst_dir``. Defaults to
            the gender folder chosen by ``isMale``.
        src_dir: Directory holding one folder per person. Defaults to the
            module-level ``LFW_DIR``.
        dst_dir: Root of the labelled copy. Defaults to the module-level
            ``NEW_LFW_DIR``.

    Returns:
        A ``(found, not_found)`` tuple of lists: the names that were copied
        and the names that had no images to copy.

    Raises:
        FileExistsError: If a destination folder for a person already exists.
    """
    if src_dir is None:
        src_dir = LFW_DIR
    if dst_dir is None:
        dst_dir = NEW_LFW_DIR
    if folder_name is None:
        folder_name = "Male/" if isMale else "Female/"

    not_found = []
    found = []
    for name in names:
        print(name)
        person_folder_path = os.path.join(src_dir, name)

        # An empty name would resolve to src_dir itself, and a missing folder
        # would make listdir raise; both are reported as "not found".
        if not name or not os.path.isdir(person_folder_path):
            not_found.append(name)
            continue

        has_images = any(
            os.path.isfile(os.path.join(person_folder_path, entry))
            for entry in os.listdir(person_folder_path)
        )
        if not has_images:
            not_found.append(name)
            continue

        # shutil.copytree creates the destination itself and replaces the
        # distutils copy_tree helper, which is unavailable on Python 3.12+.
        new_path = os.path.join(dst_dir, folder_name, name)
        shutil.copytree(person_folder_path, new_path)
        found.append(name)

    return found, not_found

In [27]:
# Reset the output folders, then copy every male identity
# (isMale defaults to True, so the copies go under "Male/").
purge_lfw_gender()
found, not_found = copy_to_new_labelled_folder(male_names) 

In [28]:
# Copy the female identities; isMale=False selects the "Female/" folder.
female_found, female_not_found = copy_to_new_labelled_folder(female_names, isMale=False)

In [29]:
# Names whose source folder held no image files; empty lists mean every
# labelled identity was copied.
print(not_found)
print(female_not_found)

[]
[]


### Copy male pictures from original LFW dataset
* `male_name_bm.txt` is a text file containing the names of 1511 randomly selected males.

In [12]:
def get_names_from_txt_file(filename):
    """Read a text file with one name per line and return the unique names.

    Surrounding whitespace (including the newline) is stripped from every
    line, and blank lines are skipped so that an empty string never ends up
    in the result as a "name".

    Args:
        filename: Path of the text file to read.

    Returns:
        A set of the non-empty names found in the file.
    """
    with open(filename) as f:
        stripped_lines = (line.strip() for line in f)
        # Building a set removes duplicates, if any.
        return {name for name in stripped_lines if name}

In [52]:
# Load the identities listed in male_name_bm.txt; the helper returns a set,
# so any repeated names have already been collapsed.
random_male_names = get_names_from_txt_file("male_name_bm.txt")
print(len(random_male_names))

# Copy those identities into the "Male_BM/" folder of the labelled dataset.
rm_found, rm_not_found = copy_to_new_labelled_folder(
    random_male_names,
    isMale=True,
    folder_name='Male_BM/',
)

1121
Sylvester_Stallone
Brandon_Webb
Luc_Montagnier
Edward_Seaga
Robert_Zoellick
Dan_Morales
Harry_Kalas
Aaron_Sorkin
Gregorio_Honasan
Roberto_Carlos
Eric_Clapton
Bill_Frist
Takashi_Sorimachi
Ed_Smart
Larry_Johnson
Jefferson_Perez
David_Duval
Sean_Patrick_OMalley
Alvaro_Silva_Calderon
Franco_Dragone
Robert_Ehrlich
Conan_OBrien
Jerry_Oliver
John_Snow
Allan_Wagner
Billy_Crawford
Bryan_Thomas
Phan_Van_Khai
Boris_Henry
Bill_Readdy
Jason_Lezak
Roy_Williams
Clifford_Robinson
Roman_Tam
Gary_Sinise
John_Kerry
Nicolas_Lapentti
Adrian_Fernandez
Ben_Kingsley
Bill_OReilly
Lucio_Gutierrez
Derek_Jeter
Abel_Pacheco
Vincent_Brooks
Rob_Marshall
Joey_Harrington
Raaf_Schefter
Kurt_Busch
Mikhail_Kasyanov
Dany_Heatley
Alex_Popov
Harland_Braun
Paul_Pierce
Brandon_Fails
Thaksin_Shinawatra
Rio_Ferdinand
Jose_Maria_Aznar
Donald_Fehr
Krishna_Bhadur_Mahara
Roger_Clemens
Lord_Hutton
Mohammed_Baqir_al-Hakim
Jonathan_Tiomkin
Lee_Soo-hyuck
Gonzalo_Sanchez_de_Lozada
Christian_Lirette
Dudley_Rogers
Rick_Romley
Jose_Ma

In [55]:
# Names from male_name_bm.txt whose source folder held no image files.
print(rm_not_found)

[]
