<a href="https://colab.research.google.com/github/vipin-jangra/face-age-estimation-CNN/blob/main/Utk_Dataset_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
from zipfile import ZipFile
import shutil
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

In [6]:
# Mount Google Drive inside the Colab VM; the dataset archive is read from
# this mount in the next cell.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Unzip the dataset archive (crop_part1.zip) stored on Google Drive

faces_zip_path = "/content/drive/MyDrive/Dataset/Dataset2/crop_part1.zip"

# Extract explicitly into /content instead of the current working directory:
# the following cells read the images from the hard-coded '/content/crop_part1'
# folder, so the extraction target must not depend on where the kernel
# happens to be running.
with ZipFile(faces_zip_path, 'r') as myzip:
    myzip.extractall('/content')
    print('Done unzipping faces.zip')

Done unzipping faces.zip


In [8]:
# Inclusive (start, end) age bounds, one tuple per age group.
# The position of a tuple in this list is the class index of that group.
age_ranges = [
    (1, 2),      # class 0
    (3, 9),      # class 1
    (10, 20),    # class 2
    (21, 27),    # class 3
    (28, 45),    # class 4
    (46, 65),    # class 5
    (66, 116),   # class 6
]

In [9]:
def categorize_age(age, ranges=None):
    """Return the index of the age range that contains ``age``.

    Args:
        age: Age value to classify.
        ranges: Optional sequence of inclusive ``(start, end)`` pairs. When
            omitted, the notebook-level ``age_ranges`` list is used, which is
            the behaviour existing callers rely on.

    Returns:
        The zero-based position of the first range whose inclusive bounds
        contain ``age``, or ``None`` when no range matches.
    """
    if ranges is None:
        ranges = age_ranges
    # enumerate() yields the position of each range, which serves as the class label.
    for range_index, (start, end) in enumerate(ranges):
        if start <= age <= end:
            return range_index
    return None

In [10]:
dataset_dir = '/content/crop_part1'

# Build the full path of every '.jpg' file in the dataset folder, keeping the
# order in which os.listdir reports the entries.
image_paths = list(
    map(
        lambda fname: os.path.join(dataset_dir, fname),
        filter(lambda fname: fname.endswith('.jpg'), os.listdir(dataset_dir)),
    )
)
print(f"Images shape: {len(image_paths)}")

Images shape: 9780


In [11]:
dataset_path = '/content/crop_part1'  # Change this to your dataset folder
# Extract the age label from each image filename (the token before the first '_').
# The labels are derived from image_paths instead of listing the folder a second
# time, so ages[i] is always the label of image_paths[i] regardless of the
# order in which the filesystem reports directory entries.
ages = [int(os.path.basename(img_path).split('_')[0]) for img_path in image_paths]
print(f"Ages shape: {len(ages)}")

Ages shape: 9780


In [12]:
# Map every raw age to the index of its age range (None when no range matches)
age_categories = list(map(categorize_age, ages))

In [13]:
import collections
# Function to print the distribution of age ranges
def print_age_distribution(labels, dataset_type):
    """Print how many samples fall into each age-range label.

    Args:
        labels: Iterable of hashable age-range labels.
        dataset_type: Name of the split, used in the heading line.

    The labels are listed in the order in which they are first encountered
    in ``labels``.
    """
    tallies = collections.Counter(labels)
    print(f"{dataset_type} Age Distribution:")
    for age_range in tallies:
        count = tallies[age_range]
        print(f"Age range {age_range}: {count} samples")

In [16]:


def _save_split(image_files, labels, split_dir, csv_path, resize_dim):
    """Preprocess one split, saving its images to ``split_dir`` and its labels to ``csv_path``."""
    records = []
    for img_path, label in zip(image_files, labels):
        filename = os.path.basename(img_path)
        try:
            # The context manager releases the source file even when the
            # conversion fails part-way.
            with Image.open(img_path) as source:
                processed = source.convert('L').resize(resize_dim)  # grayscale, then resize
            processed.save(os.path.join(split_dir, filename))
            records.append({'filename': filename, 'age': label})
        except Exception as e:
            # Best effort: report the image and leave it out of the CSV.
            print(f"Error processing {img_path}: {e}")

    # Explicit columns keep the CSV header present even if no image was saved.
    pd.DataFrame(records, columns=['filename', 'age']).to_csv(csv_path, index=False)


def preprocess_and_split_dataset(dataset_path, output_path, resize_dim=(224, 224), test_size=0.2):
    """Split the images in ``dataset_path`` into train/test sets and save them preprocessed.

    Every ``.jpg`` file in ``dataset_path`` is labelled by passing the integer
    before the first ``_`` of its filename to ``categorize_age``. The samples
    are split with stratification on that label, converted to grayscale,
    resized to ``resize_dim`` and written to ``<output_path>/train`` and
    ``<output_path>/test``. The labels are written to ``train_labels.csv`` and
    ``test_labels.csv`` in ``output_path`` with the columns ``filename`` and
    ``age`` (``age`` holds the label returned by ``categorize_age``).

    Args:
        dataset_path: Folder containing the source ``.jpg`` images.
        output_path: Folder that receives the ``train``/``test`` sub-folders
            and the two CSV files; created if missing.
        resize_dim: ``(width, height)`` passed to ``Image.resize``.
        test_size: Forwarded to ``train_test_split``.

    Notes:
        * Files are processed in sorted filename order so that the fixed
          ``random_state`` yields the same split on every machine.
        * Files whose name yields no age category are reported and skipped.
        * Images that fail to load or save are reported and left out of the CSV.
    """
    # Create output directories for training and testing
    train_dir = os.path.join(output_path, 'train')
    test_dir = os.path.join(output_path, 'test')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Collect the samples from dataset_path itself, so paths and labels are
    # built together and cannot drift out of alignment.
    sample_paths = []
    sample_labels = []
    for fname in sorted(os.listdir(dataset_path)):
        if not fname.endswith('.jpg'):
            continue
        try:
            label = categorize_age(int(fname.split('_')[0]))
        except ValueError:
            label = None  # filename does not start with an integer age
        if label is None:
            print(f"Skipping {fname}: no age category could be derived from the filename")
            continue
        sample_paths.append(os.path.join(dataset_path, fname))
        sample_labels.append(label)

    # Split the dataset into training and testing sets
    train_images, test_images, train_labels, test_labels = train_test_split(
        sample_paths, sample_labels, stratify=sample_labels, test_size=test_size, random_state=42
    )

    # Print distribution for training set
    print_age_distribution(train_labels, "Training")

    # Print distribution for the held-out (test) set
    print_age_distribution(test_labels, "Validation")

    # Save each split's images and write its label CSV
    _save_split(train_images, train_labels, train_dir,
                os.path.join(output_path, 'train_labels.csv'), resize_dim)
    _save_split(test_images, test_labels, test_dir,
                os.path.join(output_path, 'test_labels.csv'), resize_dim)

    print(f"Dataset processed and split into '{train_dir}' and '{test_dir}' with labels saved as CSV.")




# Example usage: preprocess the extracted dataset and write the split to disk

output_path = '/content/UTKDataset'   # Change this to your output folder
preprocess_and_split_dataset(dataset_path=dataset_path, output_path=output_path)


Training Age Distribution:
Age range 6: 800 samples
Age range 1: 1087 samples
Age range 5: 1347 samples
Age range 2: 1057 samples
Age range 4: 1368 samples
Age range 0: 1270 samples
Age range 3: 895 samples
Validation Age Distribution:
Age range 5: 337 samples
Age range 4: 342 samples
Age range 0: 317 samples
Age range 1: 272 samples
Age range 3: 224 samples
Age range 6: 200 samples
Age range 2: 264 samples
Dataset processed and split into '/content/UTKDataset/train' and '/content/UTKDataset/test' with labels saved as CSV.


In [17]:
import shutil

# Folder that holds the 'train' and 'test' directories and the label CSVs
output_path = '/content/UTKDataset'  # Change this to your output folder path

# Zip the contents of the folder; the archive is written as <output_path>.zip
shutil.make_archive(base_name=output_path, format='zip', root_dir=output_path)

from google.colab import files
files.download(f"{output_path}.zip")  # Download the zip file


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>