<a href="https://www.kaggle.com/code/malakalaa2004/data-preprocesion-depi?scriptVersionId=269772645" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Replace the preinstalled Pillow and torchvision with pinned versions, then add facenet-pytorch.
# NOTE(review): the pins are presumably needed for facenet-pytorch compatibility; pip may still
# change them while resolving facenet-pytorch's own requirements - confirm with `pip list`.
!pip uninstall -y pillow torchvision
!pip install pillow==9.5.0 torchvision==0.15.2
!pip install facenet-pytorch
# Send signal 9 to this process; nothing placed after this line in the cell will run.
# Presumably done so the next cell imports the versions installed above - TODO confirm.
import os; os.kill(os.getpid(), 9)  # force restart


In [None]:
import hashlib
import shutil
from pathlib import Path
from PIL import Image
from tqdm.notebook import tqdm
import torch
from torchvision import transforms
from facenet_pytorch import MTCNN
from sklearn.model_selection import train_test_split
import os

# Dataset root: each sub-folder is named after a person and holds that person's photos.
base_dir = '/kaggle/input/lfw-dataset/lfw-deepfunneled/lfw-deepfunneled'
# Output locations for the cropped faces and for the final train/validation copies.
processed_dir = '/kaggle/working/processed_faces'
train_dir = '/kaggle/working/data_split/train'
val_dir = '/kaggle/working/data_split/val'
# Fraction of each person's images that goes to the validation set.
val_split = 0.2

# Run the face detector on the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
mtcnn = MTCNN(
    image_size=160,
    margin=10,
    min_face_size=10,
    device=device,
)

# Random transforms applied to each face crop to create extra training samples.
augment = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
    ]
)

# Start from an empty processed_dir on every run.
# NOTE: train_dir and val_dir are not cleared here, so files copied by an
# earlier run stay in place.
shutil.rmtree(processed_dir, ignore_errors=True)
Path(processed_dir).mkdir(parents=True, exist_ok=True)

# Smoke test: run the detector on a single image to confirm that the dataset
# path is right and the model loads and runs.
def test_face_detection(image_path):
    """Run MTCNN detection on the image at *image_path* and print the boxes it returns."""
    rgb_image = Image.open(image_path).convert('RGB')
    # detect() returns the boxes together with their probabilities; only the boxes are shown.
    boxes, _probs = mtcnn.detect(rgb_image)
    print(f"Detected boxes for {image_path}: {boxes}")

# Build the sample path from base_dir so the check follows the configured
# dataset root instead of repeating it as a second hard-coded literal.
test_face_detection(os.path.join(base_dir, 'Aaron_Eckhart', 'Aaron_Eckhart_0001.jpg'))

# Step 1: Detect, crop, align faces, save
def save_faces(data_dir, save_dir):
    """Detect the face in every ``*.jpg`` under *data_dir* and save the crop under *save_dir*.

    Each crop is written to ``save_dir/<parent folder name>/<original file name>``.
    Images in which no face is found, or that raise while being processed, are
    reported with ``print`` and skipped so that one bad file does not stop the run.

    Args:
        data_dir: Root directory searched recursively for ``*.jpg`` files.
        save_dir: Root directory that receives one sub-folder per source folder.

    Returns:
        The number of face crops that were written.
    """
    image_paths = sorted(Path(data_dir).rglob('*.jpg'))
    num_saved = 0
    for img_path in tqdm(image_paths, desc="Processing images"):
        person_name = img_path.parent.name
        save_person_dir = Path(save_dir) / person_name
        save_person_dir.mkdir(parents=True, exist_ok=True)
        dst_path = save_person_dir / img_path.name
        try:
            with Image.open(img_path) as src:
                img = src.convert('RGB')
            # Let MTCNN write the crop itself. With post-processing enabled (the
            # library default) the tensor it returns is standardised to roughly
            # [-1, 1], so converting that tensor with ToPILImage produces a
            # corrupted picture; the file written through save_path is the
            # un-normalised crop.
            face = mtcnn(img, save_path=str(dst_path))
        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(f"Error processing {img_path}: {e}")
            continue
        if face is None:
            print(f"No face detected in {img_path}")
            continue
        num_saved += 1
        print(f"Saved face to {dst_path}")
    return num_saved

# Extract faces from the whole dataset, then report how many crops are on disk.
save_faces(data_dir=base_dir, save_dir=processed_dir)

num_processed = len(list(Path(processed_dir).rglob('*.jpg')))
print(f"Number of processed face images: {num_processed}")

# Step 2: Augmentation
def augment_faces(src_dir, num_aug=2):
    """Write *num_aug* randomly augmented copies next to every ``*.jpg`` in *src_dir*.

    Every sub-directory of *src_dir* is scanned. For an image ``name.jpg`` the
    copies are saved in the same folder as ``name_aug0.jpg``, ``name_aug1.jpg``
    and so on, each produced by the module-level ``augment`` pipeline.

    Files written by an earlier call match ``*.jpg`` as well and would be
    augmented again, so call this once on a freshly built directory.

    Args:
        src_dir: Directory whose sub-directories hold the images to augment.
        num_aug: Number of augmented copies to create per image.
    """
    for person_dir in sorted(Path(src_dir).iterdir()):
        if not person_dir.is_dir():
            continue
        # Take the listing before writing anything, so files created below can
        # never be picked up as new sources during this pass.
        source_paths = sorted(person_dir.glob('*.jpg'))
        for img_path in source_paths:
            with Image.open(img_path) as img:
                for i in range(num_aug):
                    # Copies go into the folder being scanned, which already exists.
                    out_path = person_dir / f"{img_path.stem}_aug{i}{img_path.suffix}"
                    augment(img).save(out_path)

# Add augmented copies beside every cropped face (default number of copies per image).
augment_faces(src_dir=processed_dir)

# Step 3: Remove duplicates
def remove_duplicates(folder):
    """Delete byte-identical ``*.jpg`` files found in the sub-directories of *folder*.

    Files are compared by the MD5 digest of their contents. Digests are shared
    across all sub-directories, so a copy that sits in a different sub-directory
    is removed too. Sub-directories and files are visited in sorted order, which
    makes the surviving copy predictable: the first path in that order is kept.

    Args:
        folder: Directory whose immediate sub-directories contain the images.

    Returns:
        The number of files that were deleted.
    """
    seen_hashes = set()
    num_removed = 0
    for person_dir in sorted(Path(folder).iterdir()):
        if not person_dir.is_dir():
            continue
        # sorted() also snapshots the listing, so deleting files below is safe.
        for img_path in sorted(person_dir.glob('*.jpg')):
            # MD5 serves only as a content fingerprint here, not for security.
            filehash = hashlib.md5(img_path.read_bytes()).hexdigest()
            if filehash in seen_hashes:
                img_path.unlink()
                num_removed += 1
            else:
                seen_hashes.add(filehash)
    print(f"Removed {num_removed} duplicate images.")
    return num_removed

# Drop byte-identical images from the processed faces before splitting.
remove_duplicates(folder=processed_dir)

# Step 4: Train/val split
def split_train_val(src_dir, train_dir, val_dir, val_split=0.2):
    """Copy each person's images from *src_dir* into per-person train and validation folders.

    For every sub-directory of *src_dir* holding at least two ``*.jpg`` files,
    the files are divided with ``train_test_split`` (``random_state=42``) and
    copied to ``train_dir/<person>/`` and ``val_dir/<person>/``. Sub-directories
    with fewer than two images are skipped entirely.

    NOTE: every ``*.jpg`` is treated as an independent sample, so augmented
    copies of one source image can end up on both sides of the split.

    Args:
        src_dir: Directory containing one sub-directory of images per person.
        train_dir: Destination root for the training copies.
        val_dir: Destination root for the validation copies.
        val_split: Fraction of each person's images assigned to validation.
    """
    for person_dir in sorted(Path(src_dir).iterdir()):
        if not person_dir.is_dir():
            continue
        # glob() yields files in filesystem order; sorting gives the fixed
        # random_state a stable input, so the split is the same on every run.
        img_paths = sorted(person_dir.glob('*.jpg'))
        if len(img_paths) < 2:
            continue
        train_imgs, val_imgs = train_test_split(img_paths, test_size=val_split, random_state=42)
        for out_root, subset in ((train_dir, train_imgs), (val_dir, val_imgs)):
            # Create the destination folder once per person and subset.
            out_dir = Path(out_root) / person_dir.name
            out_dir.mkdir(parents=True, exist_ok=True)
            for img_path in subset:
                shutil.copy(img_path, out_dir / img_path.name)

# Build the train/validation folders, then tell the user where the data ended up.
split_train_val(
    src_dir=processed_dir,
    train_dir=train_dir,
    val_dir=val_dir,
    val_split=val_split,
)

print(
    "Clean and structured dataset ready for training.",
    f"Train images in {train_dir}",
    f"Validation images in {val_dir}",
    sep="\n",
)
