Similar Jupyter Notebooks:
* https://www.kaggle.com/code/karan842/age-and-gender-detection-by-cnn

Videos:
* https://www.youtube.com/watch?v=vEJzsGXrB70
* https://www.youtube.com/watch?v=9AnCNBL8c6Q


# Import libraries and Modules

In [18]:
import os
import pandas as pd
import numpy as np

# Upload Pictures

In [19]:
pic_path = "pic"
path = os.path.realpath(pic_path)
# If the given path is a directory, open the directory and store the pictures in  a list
picture_list = []
if os.path.isdir(path):
    for file in os.listdir(path):
        if file.endswith(".jpg"):
            picture_list.append(file)

In [20]:
for picture in picture_list:
    print(picture)

100_1_0_20170110183726390.jpg
100_1_2_20170105174847679.jpg
100_1_2_20170110182836729.jpg
101_1_2_20170105174739309.jpg
10_0_0_20161220222308131.jpg
10_0_0_20170103200329407.jpg
10_0_0_20170103200522151.jpg
10_0_0_20170103233459275.jpg
10_0_0_20170104013211746.jpg
10_0_0_20170110215927291.jpg
10_0_0_20170110220033115.jpg
10_0_0_20170110220111082.jpg
10_0_0_20170110220235233.jpg
10_0_0_20170110220251986.jpg
10_0_0_20170110220255346.jpg
10_0_0_20170110220316298.jpg
10_0_0_20170110220403810.jpg
10_0_0_20170110220447314.jpg
10_0_0_20170110220503946.jpg
10_0_0_20170110220514186.jpg
10_0_0_20170110220530650.jpg
10_0_0_20170110220539329.jpg
10_0_0_20170110220541850.jpg
10_0_0_20170110220546177.jpg
10_0_0_20170110220548521.jpg
10_0_0_20170110220557169.jpg
10_0_0_20170110220644705.jpg
10_0_0_20170110220654150.jpg
10_0_0_20170110221714752.jpg
10_0_0_20170110221719390.jpg
10_0_0_20170110221811823.jpg
10_0_0_20170110224223937.jpg
10_0_0_20170110224238891.jpg
10_0_0_20170110224253445.jpg
10_0_0_201

# Create a Dataset from the labels

* Age
* Gender
* Ethnicity
* picture name / picture path

In [21]:
# Each picture name looks like:
print(picture_list[0])
# age_gender_ethnicity_id.jpg

100_1_0_20170110183726390.jpg


In [22]:
# We applied split() to get an array with the labels for the picture.
temp = picture_list[0].split('_')
print(temp)

['100', '1', '0', '20170110183726390.jpg']


In [23]:
age_labels = []
gender_labels = []
ethnicity_labels = []

for file in picture_list:
    temp = file.split("_")
    age_labels.append(temp[0])
    gender_labels.append(temp[1])
    ethnicity_labels.append(temp[2])


Convert to a dataframe

In [24]:
df = pd.DataFrame()
df["filename"] = picture_list
df["age"] = age_labels
df["gender"] = gender_labels
df["ethnicity"] = ethnicity_labels

In [25]:
# map labels
gender_dict = {0: 'Male', 1: 'Female'}
ethnicity_dict = {}

# Data inspection and Data cleaning

## Data shape

In [26]:
df.shape

(10137, 4)

In [27]:
display(df)

Unnamed: 0,filename,age,gender,ethnicity
0,100_1_0_20170110183726390.jpg,100,1,0
1,100_1_2_20170105174847679.jpg,100,1,2
2,100_1_2_20170110182836729.jpg,100,1,2
3,101_1_2_20170105174739309.jpg,101,1,2
4,10_0_0_20161220222308131.jpg,10,0,0
...,...,...,...,...
10132,9_1_3_20161220222856346.jpg,9,1,3
10133,9_1_3_20170104222949455.jpg,9,1,3
10134,9_1_4_20170103200637399.jpg,9,1,4
10135,9_1_4_20170103200814791.jpg,9,1,4


## Check data types and proper spelling / labels

In [28]:
for column in df:
  unique_values = df[column].nunique()
  print(f"\nColumn {column}, Data type: {df[column].dtype}")
  print(f"Unique values: {unique_values} ")
  print(df[column].value_counts())


Column filename, Data type: object
Unique values: 10137 
36_0_0_20170104201153985.jpg    1
48_1_3_20170109141805054.jpg    1
2_1_0_20170109193339281.jpg     1
24_1_3_20170104223152030.jpg    1
40_1_0_20170105164619947.jpg    1
                               ..
24_0_2_20170112003933482.jpg    1
5_0_3_20170104230622017.jpg     1
56_1_3_20170109132251210.jpg    1
1_1_2_20161219200040507.jpg     1
1_0_3_20161219225303736.jpg     1
Name: filename, Length: 10137, dtype: int64

Column age, Data type: object
Unique values: 99 
1      1271
2       524
3       303
26      280
4       270
       ... 
93        3
110       3
99        2
101       1
91        1
Name: age, Length: 99, dtype: int64

Column gender, Data type: object
Unique values: 2 
1    5598
0    4539
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 5 
0    5396
2    1703
3    1494
4    1121
1     423
Name: ethnicity, dtype: int64


It should be possible to cast from string to integer for age, gender and ethnicity labels.

In [29]:
def correct_picture(column_name, row_index, correct_age, correct_gender, correct_ethnicity):
    temp = df[column_name][row_index].split('_')
    new_filename = '{}_{}_{}_{}'.format(correct_age, correct_gender, correct_ethnicity, temp[-1])

    # rename file in the system
    old_name = f'pic/{df["filename"][row_index]}'
    new_name = f'pic/{new_filename}'
    os.rename(old_name, new_name)

    # rename file in dataset
    df["filename"][row_index] = new_filename
    df["age"] = correct_age
    df["gender"][row_index] = correct_gender
    df["ethnicity"][row_index] = correct_ethnicity

In [30]:
for column in df:
    if column != "filename":
        try: # Find out which columns have problems
            df[column].astype('float64')
        except ValueError as err:
            print(f'\nColumn " {column} " could not be converted: {err!s}')
            # Find out which rows have problems
            for row_index, value in enumerate(df[column]):
                if value is not None:
                    try:
                        int(value)
                    except ValueError as err:
                        print(f'Row {row_index} in column "{column}" with file name "{df["filename"][row_index]}" could not be converted: {err!s}')
                        if (df["filename"][row_index] == "61_1_20170109142408075.jpg" or df["filename"][row_index] == "61_3_20170109150557335.jpg"):
                            temp = df["filename"][row_index].split('_')
                            correct_picture(column, row_index, temp[0], 1, temp[1])

Filename should have the pattern: age_gender_ethnicity_id.jpg
Picture 61_1_20170109142408075.jpg is a black woman. Picture will be renamed to 61_1_1_20170109142408075.jpg
Picture 61_3_20170109150557335.jpg is an indian woman. Picture will be renamed to 61_1_3_20170109150557335.jpg

In [31]:
# Check that corrections have been implemented
for column in df:
  unique_values = df[column].nunique()
  print(f"\nColumn {column}, Data type: {df[column].dtype}")
  print(f"Unique values: {unique_values} ")
  print(df[column].value_counts())


Column filename, Data type: object
Unique values: 10137 
36_0_0_20170104201153985.jpg    1
48_1_3_20170109141805054.jpg    1
2_1_0_20170109193339281.jpg     1
24_1_3_20170104223152030.jpg    1
40_1_0_20170105164619947.jpg    1
                               ..
24_0_2_20170112003933482.jpg    1
5_0_3_20170104230622017.jpg     1
56_1_3_20170109132251210.jpg    1
1_1_2_20161219200040507.jpg     1
1_0_3_20161219225303736.jpg     1
Name: filename, Length: 10137, dtype: int64

Column age, Data type: object
Unique values: 99 
1      1271
2       524
3       303
26      280
4       270
       ... 
93        3
110       3
99        2
101       1
91        1
Name: age, Length: 99, dtype: int64

Column gender, Data type: object
Unique values: 2 
1    5598
0    4539
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 5 
0    5396
2    1703
3    1494
4    1121
1     423
Name: ethnicity, dtype: int64


## Analyze NaN values

In [32]:
# Check for the presence of NaN values in percentage
percentageNAN = round((((df.isnull().sum()).sum() / np.product(df.shape)) * 100), 2)
print(f"There are {percentageNAN}% of NaN values.")

There are 0.0% of NaN values.


In [33]:
# Number of missing values per column
print(df.isnull().sum())

filename     0
age          0
gender       0
ethnicity    0
dtype: int64


# Store dataframe in .csv file

In [34]:
df.to_csv('dataset_faces.csv', index=False)