In [81]:
from sklearn.datasets import fetch_lfw_people
# Fetch the Labeled Faces in the Wild people dataset: colour images, no
# rescaling (resize=None), keeping only people with at least 20 pictures.
lfw_people = fetch_lfw_people(
    min_faces_per_person=20,
    color=True,
    resize=None,
    funneled=True,
    download_if_missing=True,
)

# Report the shape of the flattened pixel matrix
print(lfw_people.data.shape)

(3023, 35250)


In [73]:
# Pull the flattened pixel matrix and the per-image labels off the dataset object
data, target = lfw_people.data, lfw_people.target

In [74]:
# Shape of the flattened pixel matrix: one row per image
data.shape

(3023, 35250)

In [75]:
# Un-flattened image array taken from the loaded dataset
images = lfw_people.images

In [76]:
# Shape of the un-flattened image array (first axis indexes the images)
images.shape

(3023, 125, 94, 3)

In [7]:
# Shape of the label vector: one label per image
target.shape

(3023,)

In [8]:
# Number of distinct person names in the loaded subset
lfw_people.target_names.shape

(62,)

In [11]:
import numpy as np

In [14]:
# Work on the image array and its per-image person labels
X, y = lfw_people.images, lfw_people.target

# Distinct person IDs that occur in the labels
unique_classes = np.unique(y)

# Map each person ID to the stacked array of that person's images. A boolean
# mask keeps the images in their original relative order within each class.
class_images = {class_id: X[y == class_id] for class_id in unique_classes}


In [15]:
# Order the classes from most to fewest images. The sort is stable, so classes
# with equal counts keep their previous relative order.
class_images_sorted = dict(
    sorted(class_images.items(), key=lambda item: len(item[1]), reverse=True)
)

# Report the image count per person. The loop variable is deliberately not
# named `images`, so it does not rebind the notebook-level `images` array.
for class_id, person_images in class_images_sorted.items():
    print(f"Class {class_id}, {lfw_people.target_names[class_id]}: {person_images.shape[0]} images")

Class 14, George W Bush: 530 images
Class 10, Colin Powell: 236 images
Class 58, Tony Blair: 144 images
Class 12, Donald Rumsfeld: 121 images
Class 15, Gerhard Schroeder: 109 images
Class 5, Ariel Sharon: 77 images
Class 21, Hugo Chavez: 71 images
Class 35, Junichiro Koizumi: 60 images
Class 25, Jean Chretien: 55 images
Class 31, John Ashcroft: 53 images
Class 24, Jacques Chirac: 52 images
Class 53, Serena Williams: 52 images
Class 60, Vladimir Putin: 49 images
Class 40, Luiz Inacio Lula da Silva: 48 images
Class 16, Gloria Macapagal Arroyo: 44 images
Class 6, Arnold Schwarzenegger: 42 images
Class 27, Jennifer Capriati: 42 images
Class 37, Laura Bush: 41 images
Class 39, Lleyton Hewitt: 41 images
Class 0, Alejandro Toledo: 39 images
Class 20, Hans Blix: 39 images
Class 45, Nestor Kirchner: 37 images
Class 3, Andre Agassi: 36 images
Class 1, Alvaro Uribe: 35 images
Class 42, Megawati Sukarnoputri: 33 images
Class 54, Silvio Berlusconi: 33 images
Class 57, Tom Ridge: 33 images
Class 36,

In [82]:
import albumentations as A
from sklearn.utils import shuffle
import numpy as np

# Seed for the class subsampling below, so re-running the cell keeps the same images.
# NOTE(review): the albumentations pipeline draws from its own random state, which
# this seed does not control; seed it separately (the mechanism depends on the
# installed albumentations version) if the augmented images must be reproducible too.
AUGMENT_SEED = 42
augment_rng = np.random.default_rng(AUGMENT_SEED)

# Albumentations pipeline used to synthesise extra images for small classes.
# NOTE(review): VerticalFlip yields upside-down faces - confirm this is wanted.
# NOTE(review): GaussNoise var_limit=(10, 50) looks tuned for 0-255 pixel values;
# verify it against the actual dtype / value range of the LFW images.
transform = A.Compose([
    A.HorizontalFlip(),
    A.VerticalFlip(),
    A.RandomBrightnessContrast(),
    A.RandomGamma(),
    # Contrast-only and brightness-only jitter. Written with RandomBrightnessContrast
    # because the standalone RandomContrast / RandomBrightness transforms are deprecated.
    A.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.2),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0),
    A.RandomShadow(),  # Simulate shadows on the image
    A.RandomSunFlare(),  # Simulate sun flare effect
    A.RGBShift(),  # Randomly shift the RGB channels of the input image
    A.RandomRain(),  # Simulate raindrops on the image
    A.Blur(blur_limit=3),  # Apply blur effect to the image
    A.MotionBlur(blur_limit=3),  # Apply motion blur effect to the image
    A.GaussNoise(var_limit=(10, 50)),  # Apply Gaussian noise to the image
    A.RandomSnow(),  # Simulate snow on the image
    A.RandomFog(),  # Simulate fog on the image
    A.OpticalDistortion(),  # Apply optical distortion to the image
    A.ElasticTransform(),  # Apply elastic transformation to the image
    A.GridDistortion(),  # Apply grid distortion to the image
    A.HueSaturationValue(),  # Randomly change hue, saturation, and value of the input image
])

# Target number of images per class after balancing
desired_num_images = 200

# Maps class ID -> array holding exactly `desired_num_images` images
augmented_class_images = {}

# Loop through class_images_sorted. The loop variable is not named `images`,
# so the notebook-level `images` array is left untouched.
for class_id, class_imgs in class_images_sorted.items():
    num_images = class_imgs.shape[0]
    if num_images >= desired_num_images:
        # Enough originals: keep a random subset drawn without replacement
        selected_indices = augment_rng.choice(num_images, size=desired_num_images, replace=False)
        augmented_class_images[class_id] = class_imgs[selected_indices]
    else:
        # Too few originals: cycle through them, adding one augmented copy at a
        # time until the class reaches the target size
        num_to_generate = desired_num_images - num_images
        augmented_images = np.array([
            transform(image=class_imgs[i % num_images])["image"]
            for i in range(num_to_generate)
        ])
        # Originals first, synthetic images after them
        augmented_class_images[class_id] = np.concatenate((class_imgs, augmented_images), axis=0)

# Every class must now hold exactly the target number of images
assert all(
    len(balanced) == desired_num_images for balanced in augmented_class_images.values()
), "class balancing did not produce the desired number of images for every class"


(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(144, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(121, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(109, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(77, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(71, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(60, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(55, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(53, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(52, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(52, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(49, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(48, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(44, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(42, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(42, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(41, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(41, 125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(125, 94, 3)
(39, 1

In [83]:
# Report the image count per person after balancing. The loop variable is not
# named `images`, so it does not rebind the notebook-level `images` array.
for class_id, balanced_images in augmented_class_images.items():
    print(f"Class {class_id}, {lfw_people.target_names[class_id]}: {balanced_images.shape[0]} images")

Class 14, George W Bush: 200 images
Class 10, Colin Powell: 200 images
Class 58, Tony Blair: 200 images
Class 12, Donald Rumsfeld: 200 images
Class 15, Gerhard Schroeder: 200 images
Class 5, Ariel Sharon: 200 images
Class 21, Hugo Chavez: 200 images
Class 35, Junichiro Koizumi: 200 images
Class 25, Jean Chretien: 200 images
Class 31, John Ashcroft: 200 images
Class 24, Jacques Chirac: 200 images
Class 53, Serena Williams: 200 images
Class 60, Vladimir Putin: 200 images
Class 40, Luiz Inacio Lula da Silva: 200 images
Class 16, Gloria Macapagal Arroyo: 200 images
Class 6, Arnold Schwarzenegger: 200 images
Class 27, Jennifer Capriati: 200 images
Class 37, Laura Bush: 200 images
Class 39, Lleyton Hewitt: 200 images
Class 0, Alejandro Toledo: 200 images
Class 20, Hans Blix: 200 images
Class 45, Nestor Kirchner: 200 images
Class 3, Andre Agassi: 200 images
Class 1, Alvaro Uribe: 200 images
Class 42, Megawati Sukarnoputri: 200 images
Class 54, Silvio Berlusconi: 200 images
Class 57, Tom Ridge

In [84]:
# Seed for the shuffle below, so the sample order is the same on every run
SHUFFLE_SEED = 42

flattened_images = []
labels = []

# Walk every class of augmented_class_images. The loop variable is not named
# `images`, so the notebook-level `images` array is left untouched.
for class_id, class_imgs in augmented_class_images.items():
    # One 1-D pixel vector per image
    flattened_images.extend(image.flatten() for image in class_imgs)
    # One label per image of this class
    labels.extend([class_id] * len(class_imgs))

# Convert lists to numpy arrays
flattened_images_array = np.array(flattened_images)
labels_array = np.array(labels)
assert len(flattened_images_array) == len(labels_array), "images and labels are misaligned"

# Shuffle images and labels with the same permutation so each pair stays aligned
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
shuffled_indices = shuffle_rng.permutation(len(flattened_images_array))
flattened_images_array = flattened_images_array[shuffled_indices]
labels_array = labels_array[shuffled_indices]
