Using the CSV files to build the file path of each image

In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
# Display the installed TensorFlow version so the environment used for this run is recorded
tf.__version__

'2.7.0'

From the image below, we can see that each image name always starts with "coarse_tilt_aligned_face", followed by the face_id and then the original image name, all separated by dots. We can use this pattern to create a new column called file_dir, which will let us read the images easily. The user_id is the name of the folder that contains the image.
![image](sample.png)

In [3]:
def get_filename(data, dataset_dir=None):
    """Build the full file path of every image described in ``data``.

    Each path has the form
    ``<dataset_dir>\\<user_id>\\coarse_tilt_aligned_face.<face_id>.<original_image>``.

    Parameters
    ----------
    data : pandas.DataFrame (or any mapping of equally long iterables)
        Must provide the columns ``user_id``, ``face_id`` and ``original_image``.
    dataset_dir : str, optional
        Root folder that contains one sub-folder per ``user_id``. Defaults to
        the local copy of the Adience ``faces`` directory.

    Returns
    -------
    list of str
        One path per row, in row order.
    """
    if dataset_dir is None:
        # raw string: backslashes are literal, so no invalid escape sequences
        dataset_dir = r"D:\The Great Big World of Machine Learning\Projects\datasets\AdienceBenchmarkGenderAndAgeClassification\faces"

    # Iterate positionally over the three columns instead of indexing with
    # range(len(...)), which is label-based on a Series and breaks whenever the
    # frame does not carry a fresh 0..n-1 index.
    rows = zip(data['user_id'], data['face_id'], data['original_image'])
    return [
        dataset_dir + '\\' + folder + '\\' +
        "coarse_tilt_aligned_face." + str(face_id) + '.' + orig_image
        for folder, face_id, orig_image in rows
    ]

In [4]:
def decode_image(filename, image_type, resize_shape, channels=3):
    """Read an image file and return it as a tensor with an extra leading axis.

    Parameters
    ----------
    filename : path of the image file to read.
    image_type : 'png' or 'jpg' select the dedicated decoder; any other value
        falls back to the generic ``tf.io.decode_image``.
    resize_shape : target size passed to ``tf.image.resize``, or None to skip
        resizing. Resizing is only applied for 'png' and 'jpg' images.
    channels : number of colour channels requested from the decoder.

    Returns
    -------
    The decoded (and possibly resized) image with a new axis inserted at
    position 0.
    """
    raw_bytes = tf.io.read_file(filename)

    # pick the decoder that matches the declared image type
    is_png_or_jpg = image_type in ['png', 'jpg']
    if not is_png_or_jpg:
        decoder = tf.io.decode_image  # generic decoder
    elif image_type == 'png':
        decoder = tf.io.decode_png
    else:
        decoder = tf.io.decode_jpeg
    image = decoder(raw_bytes, channels=channels)

    # only png/jpg images are resized; other types keep their decoded size
    # NOTE(review): tf.image.resize with its default method presumably returns
    # float32, so resized and non-resized outputs may differ in dtype - confirm
    if is_png_or_jpg and resize_shape is not None:
        image = tf.image.resize(image, resize_shape)

    return tf.expand_dims(image, axis=0)

In [5]:
def get_dataset(image_paths, label_list, resize_shape, channels=3):
    """Create a dataset of (image, label) pairs from file paths and labels.

    Parameters
    ----------
    image_paths : sequence of image file paths; every file is decoded as 'jpg'.
    label_list : labels, one per image path, in the same order.
    resize_shape : size each decoded image is resized to (forwarded to
        ``decode_image``).
    channels : number of colour channels to decode.

    Returns
    -------
    A ``tf.data.Dataset`` that zips the decoded images with their labels.
    """
    def _load_jpg(path):
        # turn a single file path into a decoded image tensor
        return decode_image(path, image_type='jpg', resize_shape=resize_shape, channels=channels)

    # file paths -> decoded images
    images = tf.data.Dataset.from_tensor_slices(image_paths).map(_load_jpg)

    # labels -> tensor -> dataset
    labels = tf.data.Dataset.from_tensor_slices(tf.constant(label_list))

    # pair every image with its label
    return tf.data.Dataset.zip((images, labels))

In [6]:
# Load the age labels and shuffle the rows with a fixed seed so the order is
# reproducible; reset_index() keeps the original row number in an 'index' column.
age_data = (
    pd.read_csv("age_data.csv")
    .sample(frac=1, random_state=1)
    .reset_index()
)
print(age_data.head(3)) # preview the dataset

# Same treatment for the gender labels.
gender_data = (
    pd.read_csv("gender_data.csv")
    .sample(frac=1, random_state=1)
    .reset_index()
)
print(gender_data.head()) # preview the dataset

   index        user_id                original_image  face_id        age
0   3920   10241064@N08   8315785614_59c05129c4_o.jpg      393  (38 - 48)
1  12953  113715068@N06  11857135706_7e199ca916_o.jpg     1410  (15 - 23)
2  14159  111700049@N08  11842868555_e7e14b163f_o.jpg     1547    (4 - 6)
   index        user_id                original_image  face_id gender
0   9770   63164355@N03  11019763213_35c8dcdd17_o.jpg     1097      f
1   1104   20254529@N04   9981700225_c7731daf61_o.jpg       13      f
2   9793   63164355@N03   8816062131_664ed51a2e_o.jpg     1104      m
3  13644  114776843@N02  12013388386_9d44379a27_o.jpg     1601      f
4  11786    7890646@N03  10697104793_736feee040_o.jpg     1388      f


### Age Data Partitioning
We divide the data into three parts: training data (used to fit the model), validation data (used for evaluation during training), and test data (used for evaluation after training).

In [7]:
n_age = len(age_data)  # total number of age samples

# Train - 70%
train = int(0.7 * n_age)
# Test - 15%
test = int(0.15 * n_age)
# Validation - whatever is left (about 15%). Taking the remainder instead of
# int(0.15 * n) plus a hand-tuned constant makes the three parts add up to the
# full dataset for any number of rows.
val = n_age - train - test

# Rows are taken by position: [0, train) -> train, [train, train + val) -> val,
# [train + val, n) -> test. Positive bounds avoid the negative-slice pitfall
# where `[-0:]` would return the whole frame. Each part gets a fresh 0..k-1
# index and loses the 'index' column left over from the shuffle.
train_age_data = age_data.iloc[:train].reset_index(drop=True).drop(columns=['index'])
val_age_data = age_data.iloc[train:train + val].reset_index(drop=True).drop(columns=['index'])
test_age_data = age_data.iloc[train + val:].reset_index(drop=True).drop(columns=['index'])

# let's check if the partition was done correctly
if (len(val_age_data) + len(test_age_data) + len(train_age_data)) == n_age:
    print("Data was partitioned accurately")
else:
    print("Data was NOT partitioned accurately")

Data was partitioned accurately


### Gender Data Partitioning
We divide the data into three parts: training data (used to fit the model), validation data (used for evaluation during training), and test data (used for evaluation after training).

In [8]:
n_gender = len(gender_data)  # total number of gender samples

# Train - 70%
train = int(0.7 * n_gender)
# Test - 15%
test = int(0.15 * n_gender)
# Validation - whatever is left (about 15%). Taking the remainder instead of
# int(0.15 * n) plus a hand-tuned constant makes the three parts add up to the
# full dataset for any number of rows.
val = n_gender - train - test

# Rows are taken by position: [0, train) -> train, [train, train + val) -> val,
# [train + val, n) -> test. Positive bounds avoid the negative-slice pitfall
# where `[-0:]` would return the whole frame. Each part gets a fresh 0..k-1
# index and loses the 'index' column left over from the shuffle.
train_gender_data = gender_data.iloc[:train].reset_index(drop=True).drop(columns=['index'])
val_gender_data = gender_data.iloc[train:train + val].reset_index(drop=True).drop(columns=['index'])
test_gender_data = gender_data.iloc[train + val:].reset_index(drop=True).drop(columns=['index'])

# let's check if the partition was done correctly
if (len(val_gender_data) + len(test_gender_data) + len(train_gender_data)) == n_gender:
    print("Data was partitioned accurately")
else:
    print("Data was NOT partitioned accurately")

Data was partitioned accurately


### Save Age (train, test, val) Dataset

In [9]:
def save_to_file(data, mode, suffix, save_path=None, resize_shape=(64, 64)):
    """Decode the images listed in ``data`` and save them, with labels, to disk.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows describing the images; must contain the columns needed by
        ``get_filename`` plus the label column named by ``mode``.
    mode : str
        Name of the label column (e.g. 'age' or 'gender'); also used as the
        sub-folder under ``save_path``.
    suffix : str
        Name of the split (e.g. 'train', 'test', 'val'); used as the final
        folder name.
    save_path : str, optional
        Root folder for the saved datasets. Defaults to the local
        ``TFDataset`` directory of the Adience dataset.
    resize_shape : tuple of int, optional
        Size every image is resized to before saving. Defaults to (64, 64).
    """
    if save_path is None:
        # raw string: backslashes are literal, so no invalid escape sequences
        save_path = r"D:\The Great Big World of Machine Learning\Projects\datasets\AdienceBenchmarkGenderAndAgeClassification\TFDataset"

    images = get_filename(data) # get the dir location where each image is saved
    dataset = get_dataset(images, data[mode], resize_shape) # decode images and add to dataset
    tf.data.experimental.save(dataset, save_path+"\\"+mode+"\\"+suffix) # save dataset to file
    print(f"{mode} {suffix} saved")

In [10]:
# Save the three age splits, in the order train -> test -> val
for split_name, split_frame in (
    ('train', train_age_data),
    ('test', test_age_data),
    ('val', val_age_data),
):
    save_to_file(data=split_frame, mode='age', suffix=split_name)

age train saved
age test saved
age val saved


In [11]:
# Save the three gender splits, in the order train -> test -> val
for split_name, split_frame in (
    ('train', train_gender_data),
    ('test', test_gender_data),
    ('val', val_gender_data),
):
    save_to_file(data=split_frame, mode='gender', suffix=split_name)

gender train saved
gender test saved
gender val saved
