Using the csv file to get the directory url for each image

In [1]:
import pandas as pd
import tensorflow as tf


In [2]:
tf.__version__

'2.7.0'

From the image below, we can note the image name always starts with "coarse_tilt_aligned_face" followed by the face_id and then the original image name, all separated by a dot. We can use this pattern to create a new column called file_dir, this will enable us to easily read the images. The user_id is the folder where the image is contained.
![image](sample.png)

In [25]:
from os import walk
def get_label_list(dataset, data, dataset_dir):
    file_names = "coarse_tilt_aligned_face." \
                    + dataset['face_id'].astype(str) \
                    + '.'+dataset['original_image']

    label_list = []
    # because of the nature of tensorflow's image_dataset_from_directory function, the file order have to be rearranged
    for (_,_,files) in walk(dataset_dir, topdown=True):
        for count, file in enumerate(files):
            data_value = dataset[file_names == file][data] # get age (for age data) or gender (for gender data)
            if data_value.any(): # if data_value is found, add to label_list
                label_list.append(data_value.to_numpy()[0])

            # code to show verbose
            completion_percentage = ((count + 1) / files.__len__()) * 100
            print(f'Getting label_list - analyzing {data} images: {round(completion_percentage, 2)}%', end='\r')
        print()  # clear carriage

    '''
    dataset_dir = "D:\\The Great Big World of Machine Learning\Projects\datasets\AdienceBenchmarkGenderAndAgeClassification\\faces"
    folder = data['user_id']
    separator = '.'
    face_id = data['face_id']
    orig_image = data['original_image']
    file_name = []
    for idx in range(len(folder)):
        file_name.append(dataset_dir + '\\'+ folder[idx] + '\\'+
                         "coarse_tilt_aligned_face."+
                         str(face_id[idx]) +separator+
                         orig_image[idx])

    return file_name
    '''
    return label_list

In [35]:
from pickle import dump
from sklearn.preprocessing import LabelEncoder
def save_to_file(dataset, data):
    save_path = "D:\\The Great Big World of Machine Learning\Projects\datasets\AdienceBenchmarkGenderAndAgeClassification\\ImgGenDataset"

    # decode images and add to dataset
    dataset_path = "D:\The Great Big World of Machine Learning" \
                  "\Projects\datasets\AdienceBenchmarkGenderAndAgeClassification\\new_dataset" + '\\' + data
    # for every data mode, get TTV folder
    # use folder to get label_list and data_set
    for mode in ['train', 'test', 'val']:
        # get working directory
        data_dir = dataset_path + '\\' + mode
        # fit label encoder to the label list of the dataset
        label_encoder = LabelEncoder()
        label_list = get_label_list(dataset, data, dataset_dir=data_dir)
        label_encoder.fit(label_list)
        encoded_label_array = label_encoder.transform(label_list) # encode labels
        # save the encoding to disk for use in inference
        filename = save_path + '\\' + data +'_label_encoding.sav'
        dump(label_encoder, open(filename, 'wb'))
        # set subset and validation_split for image_dataset_from_directory
        if mode == 'train':
            subset = 'training'
            val_split=0.18
        elif mode == 'val':
            subset = 'validation'
            val_split=0.18
        else:
            subset = None
            val_split = None
        # label_mode = 'int' if data == 'age' else 'binary' # set label mode for tensorflow image_dataset_from_directory
        # found a bug in label_mode and class_names

        # save images as tf.data.dataset
        tf_dataset = tf.keras.utils.image_dataset_from_directory(directory=data_dir,
                                                                 labels=encoded_label_array.tolist(),
                                                                 label_mode='int',
                                                                 batch_size=1,
                                                                 image_size=(64, 64),
                                                                 shuffle=True,
                                                                 seed=7, # cuz 7 is my favourite number :-)
                                                                 subset=subset,
                                                                 validation_split=val_split)

        tf.data.experimental.save(tf_dataset, save_path+"\\"+data+"\\"+mode) # save dataset to file
        print(f"{data} {mode} saved")
        print()

In [12]:
age_data = pd.read_csv("age_data.csv")
print(age_data.head(3)) # preview the dataset
print()

gender_data = pd.read_csv("gender_data.csv")
print(gender_data.head()) # preview the dataset

        user_id                original_image  face_id        age
0  30601258@N03  10399646885_67c7d20df9_o.jpg        1  (25 - 36)
1  30601258@N03  10424815813_e94629b1ec_o.jpg        2  (25 - 36)
2  30601258@N03  10437979845_5985be4b26_o.jpg        1  (25 - 36)

        user_id                original_image  face_id gender
0  30601258@N03  10399646885_67c7d20df9_o.jpg        1      f
1  30601258@N03  10424815813_e94629b1ec_o.jpg        2      m
2  30601258@N03  10437979845_5985be4b26_o.jpg        1      f
3  30601258@N03  10437979845_5985be4b26_o.jpg        3      m
4  30601258@N03  11816644924_075c3d8d59_o.jpg        2      m


### Age and Gender Data Partitioning
We have to divide the data into 3 parts; Training data (for training), Validation data (during training), Test data (after training).Thishas already been donewhen the data was mvoed into the train, test, val folders (reference the 'Rearrange_Image_Files.py' file)

### Save Age (train, test, val) Dataset

In [29]:
# save age data
save_to_file(dataset=age_data, data='age')


Getting label_list - analyzing age images: 100.0%
Found 13035 files belonging to 1 classes.
Using 10689 files for training.
age train saved


Getting label_list - analyzing age images: 100.0%
Found 2793 files belonging to 1 classes.
age test saved


Getting label_list - analyzing age images: 100.0%
Found 2794 files belonging to 1 classes.
Using 2292 files for training.
age val saved



In [36]:
# save gender data
save_to_file(dataset=gender_data, data='gender')


Getting label_list - analyzing gender images: 100.0%
Found 12244 files belonging to 1 classes.
Using 10041 files for training.
gender train saved


Getting label_list - analyzing gender images: 100.0%
Found 2623 files belonging to 1 classes.
gender test saved


Getting label_list - analyzing gender images: 100.0%
Found 2625 files belonging to 1 classes.
Using 472 files for validation.
gender val saved

