In [1]:
import os
import csv
import random

In [2]:
def get_files(path):
    """Index the ``.bmp`` files found under ``path`` by their containing folder.

    Parameters
    ----------
    path : str
        Directory that is walked recursively with ``os.walk``.

    Returns
    -------
    dict
        Maps a folder name (the basename of the directory that directly
        contains the file) to the sorted list of ``.bmp`` file names in it.
        The keys '0001' .. '0016' are always present, with an empty list
        when no image was found for them; any other folder that contains
        ``.bmp`` files is added after those.
    """
    # Pre-populate the sixteen expected folders so that callers can index
    # them (and report a count of 0) even when a folder is absent or empty.
    images = {'{:04d}'.format(index): [] for index in range(1, 17)}

    for root, dirs, files in os.walk(path):
        # Sorting the directory list in place makes os.walk visit the
        # sub-directories in a stable order instead of the OS listing order.
        dirs.sort()
        folder = os.path.basename(root)
        for file in sorted(files):
            if file.endswith('.bmp'):
                # setdefault avoids a KeyError for folders that are not among
                # the pre-populated keys (e.g. images lying directly in path).
                images.setdefault(folder, []).append(file)

    return images

In [3]:
# Root directories of the real (client) and fake (imposter) face images.
real_path = './NormalizedFace/ClientNormalized/'
fake_path = './NormalizedFace/ImposterNormalized/'

# Index the .bmp files below each root, grouped by folder name.
real = get_files(real_path)
fake = get_files(fake_path)

# Print one "<folder> <number of images>" line per folder for each class.
for heading, images in (('Real Faces\nClient #Images', real),
                        ('\n\nFake Faces\nClient #Images', fake)):
    print(heading)
    for folder, files in images.items():
        print(folder, len(files))

Real Faces
Client #Images
0001 249
0002 57
0003 113
0004 681
0005 190
0006 730
0007 762
0008 123
0009 213
0010 76
0011 409
0012 435
0013 472
0014 477
0015 118
0016 0


Fake Faces
Client #Images
0001 614
0002 609
0003 603
0004 608
0005 595
0006 458
0007 605
0008 599
0009 602
0010 242
0011 303
0012 384
0013 0
0014 439
0015 380
0016 468


In [4]:
def make_folder_dataset(list1, list2, count, folder_name, path1, path2, label):
    """Build labelled image pairs for one folder.

    ``n = int(count ** 0.5 + 1)`` file names are drawn at random (without
    replacement) from each list and every drawn name of ``list1`` is paired
    with every drawn name of ``list2``, giving ``n * n`` pairs (at least
    ``count``). When a list holds fewer than ``n`` names, all of its names
    are used instead of raising ``ValueError``, so fewer pairs are returned.

    Parameters
    ----------
    list1, list2 : list of str
        File names available for the first and second image of a pair.
    count : int or float
        Requested number of pairs; must be non-negative.
    folder_name : str
        Folder inserted between the root path and the file name.
    path1, path2 : str
        Root paths for ``list1`` and ``list2``; they are concatenated as-is,
        so they are expected to end with a path separator.
    label : object
        Value stored under 'label' in every pair.

    Returns
    -------
    list of dict
        One ``{'img1': ..., 'img2': ..., 'label': ...}`` entry per pair.
    """
    n = int(count ** 0.5 + 1)
    # random.sample raises ValueError when asked for more items than the
    # population holds, so the sample size is capped at the list length.
    first_names = random.sample(list1, min(n, len(list1)))
    second_names = random.sample(list2, min(n, len(list2)))

    # The folder prefixes are identical for every pair; build them once.
    prefix1 = path1 + folder_name + '/'
    prefix2 = path2 + folder_name + '/'

    return [
        {'img1': prefix1 + first, 'img2': prefix2 + second, 'label': label}
        for first in first_names
        for second in second_names
    ]

In [5]:
def generate_dataset(real, fake, entryPerFace, folders):
    """Assemble a shuffled list of labelled image pairs for ``folders``.

    For every folder, real/real pairs are generated with label 1; after
    that, for every folder, real/fake pairs are generated with label 0.
    The pairs come from ``make_folder_dataset`` and the image roots are the
    module-level ``real_path`` and ``fake_path``.

    Parameters
    ----------
    real, fake : dict
        Map a folder name to the list of image file names in that folder.
    entryPerFace : int
        Forwarded to ``make_folder_dataset`` as its ``count`` argument.
    folders : iterable of str
        Folder names to include; each must be a key of ``real`` and ``fake``.

    Returns
    -------
    list of dict
        All generated pairs, shuffled in random order.
    """
    pairs = []

    # Label 1: both images of the pair come from the real set.
    for name in folders:
        pairs.extend(
            make_folder_dataset(real[name], real[name], entryPerFace, name, real_path, real_path, 1)
        )

    # Label 0: a real image paired with a fake image of the same folder.
    for name in folders:
        pairs.extend(
            make_folder_dataset(real[name], fake[name], entryPerFace, name, real_path, fake_path, 0)
        )

    random.shuffle(pairs)
    return pairs

In [6]:
# Folders used to build the splits. '0013' and '0016' are left out: the
# counts printed earlier show no fake images for '0013' and no real images
# for '0016'.
folders = ['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008', '0009', '0010', '0011', '0012', '0014', '0015']

# Seed the global RNG so that the random sampling and shuffling below is
# repeatable across runs instead of producing different pairs every time.
SEED = 42
random.seed(SEED)

# The splits use disjoint folders: the first 8 for training, the next 2 for
# validation and the remaining 4 for testing.
train_dataset = generate_dataset(real, fake, 400, folders[:8])
valid_dataset = generate_dataset(real, fake, 250, folders[8:10])
test_dataset = generate_dataset(real, fake, 250, folders[10:])

print("Length of train dataset =", len(train_dataset))
print("Length of validation dataset =", len(valid_dataset))
print("Length of test dataset =", len(test_dataset))

Length of train dataset = 7056
Length of validation dataset = 1024
Length of test dataset = 2048


In [7]:
def write_file(file_name, dataset):
    """Write ``dataset`` to ``file_name`` as CSV with the rows in random order.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to create; an existing file is overwritten.
    dataset : list of dict
        Rows to write. The column names are taken from the keys of one row,
        so all rows are expected to share the same keys. The list itself is
        left untouched; a shuffled copy is written.

    Raises
    ------
    ValueError
        If ``dataset`` is empty. The check happens before the file is
        opened, so an existing file is not truncated in that case.
    """
    if len(dataset) == 0:
        raise ValueError('cannot write %r: dataset is empty' % (file_name,))

    # Shuffle a copy so the caller's list keeps its order.
    rows = list(dataset)
    random.shuffle(rows)

    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=list(rows[0].keys()))
        dict_writer.writeheader()
        dict_writer.writerows(rows)

In [8]:
write_file('train.csv', train_dataset)
write_file('valid.csv', valid_dataset)
write_file('test.csv', test_dataset)