In [40]:
# Run-control flags for the notebook.
# already_loaded: when True, the loading cell reads the dataset from
# 'lfw_dataset.joblib' instead of fetching it and writing that file.
already_loaded = True
# already_augmented: not read in the cells shown here — presumably it lets a
# later cell skip regenerating the augmented data; TODO confirm against that cell.
already_augmented = False

#### **Imports**

In [41]:
from joblib import dump, load
import numpy as np
import albumentations as A

from sklearn.utils import shuffle
from sklearn.datasets import fetch_lfw_people
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import os
import cv2
import random

#### **Loading Dataset**

In [42]:
# Local cache of the fetched dataset so later runs can skip the fetch.
lfw_cache_path = 'lfw_dataset.joblib'

if already_loaded and os.path.exists(lfw_cache_path):
    # Load the dataset from the saved file.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load cache files that this notebook produced itself.
    lfw_people = load(lfw_cache_path)
else:
    # Either a fresh fetch was requested or the cache file is missing (e.g. a
    # first run on a new machine): fetch instead of failing with FileNotFoundError.
    # Load the LFW dataset in color with original image size
    lfw_people = fetch_lfw_people(color=True, resize=None, min_faces_per_person=20)
    dump(lfw_people, lfw_cache_path)

# Everything below is identical for both code paths, so it lives here once.
n_samples, h, w, c = lfw_people.images.shape

# Flattened view of the images: one row per sample.
X = lfw_people.data
n_features = X.shape[1]

# Integer class id per sample and the matching person names.
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print(f"Image dimensions: {h} x {w} x {c}")
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

Total dataset size:
n_samples: 2489
Image dimensions: 125 x 94 x 3
n_features: 35250
n_classes: 43


#### **Train Test Split**

In [43]:
# Split data into train, validation, and test sets
# Step 1: hold out 20% of all samples as the test set, stratified on the labels.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: take 12.5% of the remaining 80% (i.e. 10% of the full dataset) as the
# validation set, again stratified; what is left is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.125,
    random_state=42,
    stratify=y_train_val,
)

#### **Data Augmentation**

In [44]:
# Get unique classes (person IDs)
# Person IDs that actually occur in the training split
unique_classes = np.unique(y_train)

# One (initially empty) bucket of flattened images per person ID,
# plus a lookup from person ID to that person's name
class_images = {}
for class_id in unique_classes:
    class_images[class_id] = []
images_name = {}

# Walk the training samples once, dropping each into its person's bucket
for idx, sample in enumerate(X_train):
    label = y_train[idx]
    class_images[label].append(sample)
    images_name[label] = lfw_people.target_names[label]

# Turn every bucket (a list of 1-D arrays) into a single 2-D numpy array
for class_id in list(class_images):
    class_images[class_id] = np.array(class_images[class_id])

In [45]:
# Re-order the per-class buckets from the largest class to the smallest.
# sorted() is stable, so classes with equal counts keep their existing order.
class_images_sorted = dict(
    sorted(
        class_images.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
)

# Report how many training images each person has
for class_id, images in class_images_sorted.items():
    print(f"Class {class_id}, {lfw_people.target_names[class_id]}: {len(images)} images")

Class 11, George W Bush: 371 images
Class 7, Colin Powell: 165 images
Class 40, Tony Blair: 100 images
Class 9, Donald Rumsfeld: 85 images
Class 12, Gerhard Schroeder: 76 images
Class 4, Ariel Sharon: 53 images
Class 17, Hugo Chavez: 50 images
Class 26, Junichiro Koizumi: 42 images
Class 21, Jean Chretien: 38 images
Class 20, Jacques Chirac: 37 images
Class 24, John Ashcroft: 37 images
Class 36, Serena Williams: 36 images
Class 42, Vladimir Putin: 34 images
Class 13, Gloria Macapagal Arroyo: 31 images
Class 23, Jennifer Capriati: 30 images
Class 28, Laura Bush: 29 images
Class 30, Lleyton Hewitt: 29 images
Class 0, Alejandro Toledo: 27 images
Class 16, Hans Blix: 27 images
Class 3, Andre Agassi: 25 images
Class 1, Alvaro Uribe: 24 images
Class 27, Kofi Annan: 23 images
Class 31, Megawati Sukarnoputri: 23 images
Class 39, Tom Ridge: 23 images
Class 41, Vicente Fox: 23 images
Class 8, David Beckham: 22 images
Class 6, Bill Clinton: 20 images
Class 19, Jack Straw: 19 images
Class 25, Juan

In [46]:
def get_augmented_data(class_images_sorted, desired_num_images=200,
                       image_shape=None, transform=None, seed=None):
    """Balance every class to exactly ``desired_num_images`` flattened images.

    Classes that already have at least ``desired_num_images`` images are
    randomly subsampled (without replacement). Smaller classes keep all their
    original images and are topped up with augmented copies of randomly chosen
    originals (chosen with replacement).

    Args:
        class_images_sorted: dict mapping class id -> 2-D numpy array with one
            flattened image per row.
        desired_num_images: number of images every class has in the result.
        image_shape: shape each flattened row is reshaped to before it is
            augmented. Defaults to the notebook-level ``(h, w, c)``.
        transform: callable invoked as ``transform(image=img)`` that returns a
            mapping with the augmented image under ``"image"`` (the
            Albumentations calling convention). Defaults to the flip / rotate /
            brightness-contrast / blur pipeline defined below.
        seed: optional seed for the image *selection* randomness. When None the
            global ``np.random`` state is used, as before. The randomness
            inside ``transform`` is not controlled by this argument.

    Returns:
        dict with the same keys (in the same order) as ``class_images_sorted``;
        each value is an array with ``desired_num_images`` rows. For topped-up
        classes the original rows come first, followed by the augmented rows.

    Raises:
        ValueError: if a class has no images but needs to be topped up.
    """
    if image_shape is None:
        # Fall back to the dimensions set by the dataset-loading cell.
        image_shape = (h, w, c)

    if transform is None:
        # Default Albumentations pipeline; each step is applied with probability 0.5
        transform = A.Compose([
            A.HorizontalFlip(p=0.5),
            A.Rotate(limit=(-20, 20), p=0.5),  # Random rotation between -20 and 20 degrees
            A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=0.5),
            A.Blur(blur_limit=2, p=0.5),
        ])

    # The np.random module and a RandomState instance both expose choice(), so
    # unseeded callers keep drawing from the global state exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create a dictionary to hold the balanced classes
    augmented_class_images = {}

    for class_id, images in class_images_sorted.items():
        num_images = images.shape[0]

        if num_images >= desired_num_images:
            # Enough images already: randomly keep desired_num_images of them
            selected_indices = rng.choice(num_images, size=desired_num_images, replace=False)
            augmented_class_images[class_id] = images[selected_indices]
            continue

        if num_images == 0:
            # Nothing to augment from; fail with a message that names the class
            raise ValueError(f"Class {class_id} has no images to augment from.")

        # Too few images: generate the missing ones by augmenting originals
        num_to_generate = desired_num_images - num_images
        augmented_images = []
        for _ in range(num_to_generate):
            # Pick one original at random and restore its image shape
            selected_image = images[rng.choice(num_images)].reshape(image_shape)

            # Apply the augmentation pipeline, then flatten back to one row
            augmented_image = transform(image=selected_image)["image"]
            augmented_images.append(augmented_image.flatten())

        # Originals first, augmented copies after them
        augmented_class_images[class_id] = np.concatenate(
            (images, np.array(augmented_images)), axis=0
        )

    return augmented_class_images

In [47]:
# Build the balanced per-class image sets
augmented_class_images = get_augmented_data(class_images_sorted)

# Report the resulting image count for every person
for class_id in augmented_class_images:
    images = augmented_class_images[class_id]
    print(f"Class {class_id}, {lfw_people.target_names[class_id]}: {len(images)} images")

Class 11, George W Bush: 200 images
Class 7, Colin Powell: 200 images
Class 40, Tony Blair: 200 images
Class 9, Donald Rumsfeld: 200 images
Class 12, Gerhard Schroeder: 200 images
Class 4, Ariel Sharon: 200 images
Class 17, Hugo Chavez: 200 images
Class 26, Junichiro Koizumi: 200 images
Class 21, Jean Chretien: 200 images
Class 20, Jacques Chirac: 200 images
Class 24, John Ashcroft: 200 images
Class 36, Serena Williams: 200 images
Class 42, Vladimir Putin: 200 images
Class 13, Gloria Macapagal Arroyo: 200 images
Class 23, Jennifer Capriati: 200 images
Class 28, Laura Bush: 200 images
Class 30, Lleyton Hewitt: 200 images
Class 0, Alejandro Toledo: 200 images
Class 16, Hans Blix: 200 images
Class 3, Andre Agassi: 200 images
Class 1, Alvaro Uribe: 200 images
Class 27, Kofi Annan: 200 images
Class 31, Megawati Sukarnoputri: 200 images
Class 39, Tom Ridge: 200 images
Class 41, Vicente Fox: 200 images
Class 8, David Beckham: 200 images
Class 6, Bill Clinton: 200 images
Class 19, Jack Straw: 

#### **Storing Augmented Data**