In [43]:
# Mount Google Drive in the Colab runtime so that files stored on the Drive
# become reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# Folder on the mounted Drive that holds the pictures used in this notebook.
path_to_pictures = '/content/drive/MyDrive/pic/'

# Import libraries and Modules

In [45]:
import os
import pandas as pd
import numpy as np

# Load Pictures

In [65]:
# Resolve the pictures folder and collect the names of the .jpg files in it.
# The list stays empty when the resolved path is not a directory.
path = os.path.realpath(path_to_pictures)
picture_list = (
    [entry for entry in os.listdir(path) if entry.endswith(".jpg")]
    if os.path.isdir(path)
    else []
)

In [47]:
# Preview the first five file names
picture_list[:5]

['36_1_1_20170116155035948.jpg',
 '36_1_1_20170116014305365.jpg',
 '36_1_1_20170116022026435.jpg',
 '36_1_1_20170116153744623.jpg',
 '36_1_1_20170116161213859.jpg']

# Create a Dataset from the labels

* Age
* Gender
* Ethnicity
* picture name / picture path

In [48]:
# Each picture name follows the pattern: age_gender_ethnicity_id.jpg
# Print the first file name as an example.
print(picture_list[0])

36_1_1_20170116155035948.jpg


In [49]:
# Splitting the file name on '_' yields a list of strings: the labels
# (age, gender, ethnicity) followed by the picture id, which still carries
# the ".jpg" extension.
temp = picture_list[0].split('_')
print(temp)

['36', '1', '1', '20170116155035948.jpg']


In [66]:
# Split every file name once, then read the three labels by position:
# token 0 is the age, token 1 the gender, token 2 the ethnicity.
# The labels are kept as strings, exactly as they appear in the file name.
name_parts = [picture_name.split("_") for picture_name in picture_list]

age_labels = [parts[0] for parts in name_parts]
gender_labels = [parts[1] for parts in name_parts]
ethnicity_labels = [parts[2] for parts in name_parts]


Convert to a dataframe

In [67]:
# Assemble the dataset in one step: one row per picture, one column per label.
df = pd.DataFrame(
    {
        "filename": picture_list,
        "age": age_labels,
        "gender": gender_labels,
        "ethnicity": ethnicity_labels,
    }
)

In [68]:
# Lookup tables from numeric label codes to readable names.
# NOTE(review): the keys are ints, while the labels parsed from the file names
# are still strings at this point — cast before using these for mapping.
gender_dict = {0: 'Male', 1: 'Female'}
# TODO: ethnicity_dict is still empty and needs to be filled in. The notes in
# this notebook mention 0 = white, 1 = black and 3 = indian — confirm these and
# the remaining codes against the dataset documentation.
ethnicity_dict = {}

# Data inspection and Data cleaning

## Data shape

In [69]:
# (number of rows, number of columns) of the dataset; one row per picture
df.shape

(24105, 4)

In [70]:
# Render the dataframe to inspect the parsed labels next to their file names
display(df)

Unnamed: 0,filename,age,gender,ethnicity
0,36_1_1_20170116155035948.jpg,36,1,1
1,36_1_1_20170116014305365.jpg,36,1,1
2,36_1_1_20170116022026435.jpg,36,1,1
3,36_1_1_20170116153744623.jpg,36,1,1
4,36_1_1_20170116161213859.jpg,36,1,1
...,...,...,...,...
24100,53_0_0_20170111200949284.jpg,53,0,0
24101,52_0_0_20170111200958422.jpg,52,0,0
24102,80_0_0_20170111200927317.jpg,80,0,0
24103,42_0_1_20170111200932454.jpg,42,0,1


## Check data types and proper spelling / labels

In [55]:
# For every column report its dtype, the number of distinct values and
# how often each value occurs.
for column_name, series in df.items():
    distinct_count = series.nunique()
    print(f"\nColumn {column_name}, Data type: {series.dtype}")
    print(f"Unique values: {distinct_count} ")
    print(series.value_counts())


Column filename, Data type: object
Unique values: 24105 
36_1_1_20170116155035948.jpg    1
37_1_0_20170104171729234.jpg    1
40_0_0_20170104202600778.jpg    1
43_0_0_20170104205149195.jpg    1
32_0_0_20170104202239802.jpg    1
                               ..
39_0_0_20170113183735128.jpg    1
38_1_1_20170113183305653.jpg    1
38_1_1_20170113184328127.jpg    1
40_0_1_20170113182334142.jpg    1
69_0_0_20170111200942998.jpg    1
Name: filename, Length: 24105, dtype: int64

Column age, Data type: object
Unique values: 104 
26     2206
1      1282
28      921
35      881
24      861
       ... 
115       3
91        2
101       2
103       1
111       1
Name: age, Length: 104, dtype: int64

Column gender, Data type: object
Unique values: 2 
0    12581
1    11524
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 5 
0    10221
1     4560
3     4028
2     3586
4     1710
Name: ethnicity, dtype: int64


It should be possible to cast from string to integer for age, gender and ethnicity labels.

In [56]:
def correct_picture(column_name, row_index, correct_age, correct_gender, correct_ethnicity,
                    frame=None, pictures_dir=None):
    """Rename a mislabelled picture on disk and fix its row in the dataset.

    The new file name is built as ``age_gender_ethnicity_id``, where ``id`` is
    the last ``_``-separated token of the picture's current file name
    (it includes the ``.jpg`` extension).

    Args:
        column_name: Column in which the problem was detected. Kept so that
            existing calls keep working; the picture id is always read from
            the "filename" column, whatever this value is.
        row_index: Index label of the row to correct.
        correct_age: Age to store for the picture.
        correct_gender: Gender code to store for the picture.
        correct_ethnicity: Ethnicity code to store for the picture.
        frame: DataFrame to update in place. Defaults to the notebook-level
            ``df``.
        pictures_dir: Folder that contains the picture. Defaults to the
            notebook-level ``path_to_pictures``.

    Raises:
        OSError: If the file cannot be renamed. The rename happens before any
            write to the dataframe, so the row is left untouched in that case.
    """
    target = df if frame is None else frame
    folder = path_to_pictures if pictures_dir is None else pictures_dir

    old_filename = target.at[row_index, "filename"]
    # Read the id from the file name itself: the value of the problem column is
    # not guaranteed to contain it (it can even be an empty string).
    picture_id = old_filename.split('_')[-1]
    new_filename = '{}_{}_{}_{}'.format(correct_age, correct_gender, correct_ethnicity, picture_id)

    # rename file in the system
    os.rename(os.path.join(folder, old_filename), os.path.join(folder, new_filename))

    # Update only this row. Writing through .at avoids chained-indexing
    # assignments, which may silently modify a temporary copy, and avoids
    # overwriting a whole column by accident.
    # Labels are stored as strings so the columns stay consistent with the
    # labels parsed from the other file names.
    target.at[row_index, "filename"] = new_filename
    target.at[row_index, "age"] = str(correct_age)
    target.at[row_index, "gender"] = str(correct_gender)
    target.at[row_index, "ethnicity"] = str(correct_ethnicity)

In [63]:
# Manually verified labels for the pictures whose file names break the
# age_gender_ethnicity_id.jpg pattern: file name -> (gender, ethnicity).
# The labels are spelled out explicitly instead of being read from a fixed
# position in the file name: for "53__0_..." the second token is an empty
# string, so position-based parsing would produce an empty ethnicity.
manual_labels = {
    "61_1_20170109142408075.jpg": ("1", "1"),
    "61_3_20170109150557335.jpg": ("1", "3"),
    "39_1_20170116174525125.jpg": ("1", "1"),
    "53__0_20170116184028385.jpg": ("1", "0"),
}

for column in df:
    if column == "filename":
        continue
    try:  # Find out which columns have problems
        df[column].astype('float64')
    except ValueError as column_err:
        print(f'\nColumn " {column} " could not be converted: {column_err!s}')
        # Find out which rows have problems
        for row_index, value in enumerate(df[column]):
            if value is None:
                continue
            try:
                int(value)
            except ValueError as row_err:
                filename = df["filename"][row_index]
                print(f'Row {row_index} in column "{column}" with file name "{filename}" could not be converted: {row_err!s}')
                if filename in manual_labels:
                    gender, ethnicity = manual_labels[filename]
                    # The age is always the first token of the file name.
                    age = filename.split('_')[0]
                    # Pass "filename" as the column so the picture id is read
                    # from the file name rather than from the broken label.
                    correct_picture("filename", row_index, age, gender, ethnicity)

Filename should have the pattern: age_gender_ethnicity_id.jpg

We found the following issues:
* Picture 39_1_20170116174525125.jpg is a black woman. Picture will be renamed to 39_1_1_20170116174525125.jpg
* Picture 53__0_20170116184028385.jpg is a white woman. Picture will be renamed to 53_1_0_20170116184028385.jpg
* Picture 61_1_20170109142408075.jpg is a black woman. Picture will be renamed to 61_1_1_20170109142408075.jpg
* Picture 61_3_20170109150557335.jpg is an Indian woman. Picture will be renamed to 61_1_3_20170109150557335.jpg

In [64]:
# Check that corrections have been implemented
for column in df:
  unique_values = df[column].nunique()
  print(f"\nColumn {column}, Data type: {df[column].dtype}")
  print(f"Unique values: {unique_values} ")
  print(df[column].value_counts())


Column filename, Data type: object
Unique values: 24105 
36_1_1_20170116155035948.jpg    1
37_1_0_20170104171729234.jpg    1
40_0_0_20170104202600778.jpg    1
43_0_0_20170104205149195.jpg    1
32_0_0_20170104202239802.jpg    1
                               ..
39_0_0_20170113183735128.jpg    1
38_1_1_20170113183305653.jpg    1
38_1_1_20170113184328127.jpg    1
40_0_1_20170113182334142.jpg    1
69_0_0_20170111200942998.jpg    1
Name: filename, Length: 24105, dtype: int64

Column age, Data type: object
Unique values: 104 
26     2206
1      1282
28      921
35      881
24      861
       ... 
115       3
91        2
101       2
103       1
111       1
Name: age, Length: 104, dtype: int64

Column gender, Data type: object
Unique values: 2 
0    12581
1    11524
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 5 
0    10221
1     4560
3     4028
2     3586
4     1710
Name: ethnicity, dtype: int64


## Analyze NaN values

In [59]:
# Share of missing cells (NaN/None) across the whole dataframe, in percent.
# df.size is rows * columns; it replaces np.product(df.shape), an alias that
# was deprecated and then removed in NumPy 2.0.
missing_cells = df.isnull().sum().sum()
percentageNAN = round((missing_cells / df.size) * 100, 2)
print(f"There are {percentageNAN}% of NaN values.")

There are 0.0% of NaN values.


In [60]:
# Number of missing values (NaN/None) in each column
print(df.isnull().sum())

filename     0
age          0
gender       0
ethnicity    0
dtype: int64


## Check for duplicate rows

In [61]:
# True if at least one row repeats an earlier row in every column
df.duplicated().any()

False

# Store dataframe in .csv file

In [62]:
# Write the dataset to dataset_faces.csv in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv('dataset_faces.csv', index=False)