In [21]:
from itertools import combinations
import os
import pandas as pd
import random
import numpy as np


# LFW Dataset:
# 13233 Images, 5749 people, 1680 with two or more images.

# Randomly selecting images for the training and test dataset
# Labels: (img1, img2, label)
# label: 0 if not the same person
# label: 1 if the same person

# Typically runs for 2.5 to 5 mins.
# A little intensive on memory.

# Num of combinations = 13233 * 13232
# Dataset is skewed, ie. some people have a lot of images and others may have a single image.
# For instance, George Bush have above 450 images in the dataset.
# In order to prevent biasing, I put a condition on the selection process.

# Step 1: Make combinations
# Step 2: Shuffle the combination
# Step 3: For label 0, select at most 10 different pairs from a single person.
#         For label 1, image of a particular person can be used at most in 10 different pairs.
# Step 4: Divide the label 0 and label 1, as a result:
#         Train set: 12k images, with 6k label(0) and 6k label(1)
#          Test set:  2k images, with 1k label(0) and 1k label(1)
# Step 5: Shuffle both sets and save as a csv file.

path = f"/Users/necatiisik/lfw_dataset/lfw/"

name_list = []
for root, dirs, files in os.walk(path, topdown=False):
    for name in sorted(files):
        name_list.append(name)

name_list.remove(".DS_Store")

slicer = slice(-9)

name_combination = combinations(name_list, 2)
shuffle_list = list(name_combination)
random.shuffle(shuffle_list)

my_list_pos = []
my_list_neg = []

counter_pos = 0
counter_neg = 0

duplicate_list_pos = []
duplicate_list_neg = []

for items in shuffle_list:
    if items[0][slicer] == items[1][slicer]:
        if counter_pos < 7000:
            if duplicate_list_pos.count(items[0][slicer]) < 10:
                counter_pos += 1
                my_list_pos.append(items)
                duplicate_list_pos.append(items[0][slicer])
    elif counter_neg < 7000:
        if duplicate_list_neg.count(items[0][slicer]) < 10 and duplicate_list_neg.count(items[0][slicer]) < 10:
            counter_neg += 1
            my_list_neg.append(items)
            duplicate_list_neg.append(items[0][slicer])
            duplicate_list_neg.append(items[1][slicer])

    if counter_pos == 100:
        print(f"Current counters: Pos:{counter_pos} Neg:{counter_neg}")
    if counter_pos == 1000:
        print(f"Current counters: Pos:{counter_pos} Neg:{counter_neg}")
    if counter_pos == 3000:
        print(f"Current counters: Pos:{counter_pos} Neg:{counter_neg}")
    if counter_pos == 5000:
        print(f"Current counters: Pos:{counter_pos} Neg:{counter_neg}")

    if counter_pos == 7000 and counter_neg == 7000:
        break

print(f"Length of pos list: {len(my_list_pos)}")
print(f"Length of neg list: {len(my_list_neg)}")

label_list_tot = []
label_list_pos = []
label_list_neg = []

for items in my_list_pos:
    label_list_pos.append((items[0], items[1], 1.0))

for items in my_list_neg:
    label_list_neg.append((items[0], items[1], 0.0))

label_list_tot = label_list_pos + label_list_neg

test_pos = random.sample(label_list_pos, 1000)
test_neg = random.sample(label_list_neg, 1000)
test_tot = test_pos + test_neg

train_tot = [elements for elements in label_list_tot if elements not in test_tot]

print(f"Length of train list: {len(train_tot)}")
print(f"Length of test list: {len(test_tot)}")


random.shuffle(train_tot)
random.shuffle(test_tot)

csv_file = pd.DataFrame(train_tot)
csv_file.to_csv("14k_train.txt", header=None, index=None)

csv_file = pd.DataFrame(test_tot)
csv_file.to_csv("2k_test.txt", header=None, index=None)


Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Pos:100 Neg:7000
Current counters: Po

In [17]:
test_list = ["George", "George", "George"]

print(test_list.count("George"))
print(type(test_list.count("George")))

if test_list.count("George") < 5 and test_list.count("George") < 5:
    print("yeay")



3
<class 'int'>
yeay
