<a href="https://colab.research.google.com/github/tiffanytang34/Age_Detection_Using_Facial_Image_with_CNN/blob/main/2_dataset_prep_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from skimage.filters.rank import entropy
from skimage.morphology import disk
from skimage.filters import gaussian, sobel
from skimage.feature import canny

import os
from zipfile import ZipFile
import time
from datetime import datetime

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataset Preparation

In [3]:
# Unzipping the dataset file combined_faces.zip

combined_faces_zip_path = "/content/drive/My Drive/Age_Detection/combined_faces.zip"

with ZipFile(combined_faces_zip_path, 'r') as myzip:
    myzip.extractall()
    print('Done unzipping combined_faces.zip')

Done unzipping combined_faces.zip


In [4]:
# Accessing all image file names.

combined_faces_path = "/content/content/combined_faces"
combined_faces_image_names = os.listdir(combined_faces_path)

In [5]:
len(combined_faces_image_names)

33486

### Train and Test Splitting


Before performing any form of classification or feature extraction on the images, it is necessary to **split the *combined_faces* dataset into training and testing datasets**. Feature extraction will be done seperately in the trainset and testset. 

In [6]:
combined_classes = pd.read_csv("/content/drive/My Drive/Age_Detection/combined_faces_classes_summary.csv")
combined_classes

Unnamed: 0,Class label,Age-ranges (classes),No. of images,Class balance (%)
0,0,1 - 2,3192,9.53
1,1,3 - 9,2816,8.41
2,2,10 - 20,3136,9.37
3,3,21 - 25,3474,10.37
4,4,26 - 27,3217,9.61
5,5,28 - 31,3063,9.15
6,6,32 - 36,3086,9.22
7,7,37 - 45,3207,9.58
8,8,46 - 54,2802,8.37
9,9,55 - 65,2796,8.35


In [7]:
# Defining a function to return the class labels corresponding to the age-ranges shown above.

def class_labels(age):
    if 1 <= age <= 2:
        return 0
    elif 3 <= age <= 9:
        return 1
    elif 10 <= age <= 20:
        return 2
    elif 21 <= age <= 25:
        return 3
    elif 26 <= age <= 27:
        return 4
    elif 28 <= age <= 31:
        return 5
    elif 32 <= age <= 36:
        return 6
    elif 37 <= age <= 45:
        return 7
    elif 46 <= age <= 54:
        return 8
    elif 55 <= age <= 65:
        return 9
    else:
        return 10

In [8]:
# Creating a new dataframe to hold all filenames, corresponding ages and class labels.

master_df = pd.DataFrame()
master_df['filename'] = combined_faces_image_names
master_df['age'] = master_df['filename'].map(lambda img_name : np.uint8(img_name.split("_")[0]))
master_df['target'] = master_df['age'].map(class_labels)

master_df.head()

Unnamed: 0,filename,age,target
0,9_308.jpg,9,1
1,47_136.jpg,47,8
2,56_226.jpg,56,9
3,52_75.jpg,52,8
4,45_256.jpg,45,7


In [9]:
# Shuffling the rows of combined_df so as to mix together the rows coming from both subreddit datasets.

master_df = shuffle(master_df, random_state=42).reset_index(drop=True)
master_df.head()

Unnamed: 0,filename,age,target
0,37_107.jpg,37,7
1,79_19.jpg,79,10
2,35_618.jpg,35,6
3,40_478.jpg,40,7
4,62_197.jpg,62,9


In [10]:
# Defining the filenames and ages from above master_df as X, and target as y for splitting into train and test datasets later.

X = master_df[['filename', 'age']]
y = master_df['target']

In [11]:
# Splitting the dataset into training and testing datasets with test_size=0.3 and stratify=y. 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [12]:
X_train.shape

(23440, 2)

In [13]:
X_train.head()

Unnamed: 0,filename,age
25511,17_21.jpg,17
29767,9_285.jpg,9
12652,3_306.jpg,3
27550,40_314.jpg,40
31927,21_178.jpg,21


In [14]:
X_test.shape

(10046, 2)

In [15]:
y_train.value_counts(normalize=True)

3     0.103754
4     0.096075
7     0.095776
0     0.095307
2     0.093643
6     0.092150
5     0.091468
1     0.084087
8     0.083703
9     0.083490
10    0.080546
Name: target, dtype: float64

In [16]:
y_test.value_counts(normalize=True)

3     0.103723
4     0.096058
7     0.095760
0     0.095361
2     0.093669
6     0.092176
5     0.091479
1     0.084113
8     0.083615
9     0.083516
10    0.080530
Name: target, dtype: float64

In [17]:
# Defining a function to append the filepath to each image name as a string.

combined_faces_path = "/content/content/combined_faces"

def append_path_to_filename(filename):
    return os.path.join(combined_faces_path, filename)

In [18]:
# Creating copies of X and y (both train and test) from above to create a dataframe of filepaths to all images and their target labels.
# These dataframes will be in the deep learning models later to create dataset input pipelines using TensorFlow.data.Dataset API.

temp_X_train = X_train.copy()
temp_X_train['target'] = y_train

temp_X_test = X_test.copy()
temp_X_test['target'] = y_test

In [19]:
# Exporting the above created dataframes as CSV files.

temp_X_train.to_csv("/content/drive/My Drive/Age_Detection/images_filenames_labels_train.csv", index=False)
temp_X_test.to_csv("/content/drive/My Drive/Age_Detection/images_filenames_labels_test.csv", index=False)