In [1]:
# Mount Google Drive at /content/drive (Colab only); the picture folder
# configured below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Folder on the mounted Drive that holds the .jpg pictures to be indexed.
path_to_pictures = '/content/drive/MyDrive/pic/'

# Import libraries and Modules

In [3]:
import os
import pandas as pd
import numpy as np

# Upload Pictures

In [4]:
# Resolve the configured folder and collect the names of all .jpg files in it.
# picture_list stays empty when the path is not a directory.
path = os.path.realpath(path_to_pictures)
picture_list = (
    [entry for entry in os.listdir(path) if entry.endswith(".jpg")]
    if os.path.isdir(path)
    else []
)

In [5]:
# Preview the first five file names.
picture_list[:5]

['36_0_0_20170109003426572.jpg',
 '49_1_0_20170109011101339.jpg',
 '69_0_0_20170109011208585.jpg',
 '30_1_0_20170109001620649.jpg',
 '26_1_4_20170109002629914.jpg']

# Create a Dataset from the labels

* Age
* Gender
* Ethnicity
* picture name / picture path

In [6]:
# Each picture name follows the pattern: age_gender_ethnicity_id.jpg
# Print the first name as an example.
print(picture_list[0])

36_0_0_20170109003426572.jpg


In [7]:
# split('_') breaks the file name into a list of its label fields;
# the last element is the id and still carries the ".jpg" extension.
temp = picture_list[0].split('_')
print(temp)

['36', '0', '0', '20170109003426572.jpg']


In [8]:
# Split every file name once, then read the first three fields as labels.
# Labels are kept as strings exactly as they appear in the file name.
name_fields = [picture_name.split("_") for picture_name in picture_list]
age_labels = [fields[0] for fields in name_fields]
gender_labels = [fields[1] for fields in name_fields]
ethnicity_labels = [fields[2] for fields in name_fields]


Convert to a dataframe

In [9]:
# Build the label table in one step: one row per picture, one column per label.
df = pd.DataFrame(
    {
        "filename": picture_list,
        "age": age_labels,
        "gender": gender_labels,
        "ethnicity": ethnicity_labels,
    }
)

In [10]:
# map labels
# NOTE(review): the keys here are integers, while the labels parsed from the
# file names are strings ('0', '1', ...); cast the columns before mapping.
gender_dict = {0: 'Male', 1: 'Female'}
# TODO: ethnicity_dict is still empty; fill in the code-to-label mapping.
ethnicity_dict = {}

# Data inspection and Data cleaning

## Data shape

In [11]:
# (number of rows, number of columns) of the label table.
df.shape

(10137, 4)

In [12]:
# Show the label table; long frames are truncated in the rendered output.
display(df)

Unnamed: 0,filename,age,gender,ethnicity
0,36_0_0_20170109003426572.jpg,36,0,0
1,49_1_0_20170109011101339.jpg,49,1,0
2,69_0_0_20170109011208585.jpg,69,0,0
3,30_1_0_20170109001620649.jpg,30,1,0
4,26_1_4_20170109002629914.jpg,26,1,4
...,...,...,...,...
10132,70_0_1_20170111201412415.jpg,70,0,1
10133,70_0_0_20170111201419254.jpg,70,0,0
10134,39_0_0_20170111201500864.jpg,39,0,0
10135,57_0_0_20170111201514151.jpg,57,0,0


## Check data types and proper spelling / labels

In [13]:
# For every column report its dtype, the number of distinct values and
# how often each value occurs.
for column, series in df.items():
  print(f"\nColumn {column}, Data type: {series.dtype}")
  print(f"Unique values: {series.nunique()} ")
  print(series.value_counts())


Column filename, Data type: object
Unique values: 10137 
36_0_0_20170109003426572.jpg    1
1_0_2_20161219201523620.jpg     1
1_0_0_20161219195753899.jpg     1
1_0_2_20161219203236876.jpg     1
1_0_0_20161219200250923.jpg     1
                               ..
14_0_0_20170104012341136.jpg    1
14_0_3_20170104012427337.jpg    1
14_0_2_20170104012541763.jpg    1
12_1_2_20170104012405785.jpg    1
50_0_0_20170111201507166.jpg    1
Name: filename, Length: 10137, dtype: int64

Column age, Data type: object
Unique values: 99 
1      1271
2       524
3       303
26      280
4       270
       ... 
110       3
93        3
99        2
91        1
101       1
Name: age, Length: 99, dtype: int64

Column gender, Data type: object
Unique values: 2 
1    5598
0    4539
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 5 
0    5396
2    1703
3    1494
4    1121
1     423
Name: ethnicity, dtype: int64


It should be possible to cast from string to integer for age, gender and ethnicity labels.

In [14]:
def correct_picture(column_name, row_index, correct_age, correct_gender, correct_ethnicity, picture_dir=None):
    """Rename a mislabelled picture on disk and fix its row in ``df``.

    The new name is ``age_gender_ethnicity_id.jpg``, where the id is the last
    ``_``-separated field of the picture's current file name.

    Args:
        column_name: Column in which the bad value was detected. Kept so
            existing calls keep working; the id is read from the file name.
        row_index: Index label of the row in ``df`` to correct.
        correct_age: Age label to store.
        correct_gender: Gender label to store.
        correct_ethnicity: Ethnicity label to store.
        picture_dir: Folder that contains the picture. Defaults to the
            notebook-level ``path_to_pictures``.

    Raises:
        OSError: If the file cannot be renamed (for example, it is missing).
    """
    if picture_dir is None:
        picture_dir = path_to_pictures

    old_filename = df.loc[row_index, "filename"]
    picture_id = old_filename.split('_')[-1]

    # Store labels as strings, like the ones parsed from the file names, so a
    # column never mixes e.g. 1 and '1'.
    age = str(correct_age)
    gender = str(correct_gender)
    ethnicity = str(correct_ethnicity)
    new_filename = '{}_{}_{}_{}'.format(age, gender, ethnicity, picture_id)

    # rename file in the system
    os.rename(
        os.path.join(picture_dir, old_filename),
        os.path.join(picture_dir, new_filename),
    )

    # rename file in dataset; .loc writes to this single row only (the old
    # code assigned the age to the whole column and used chained indexing)
    df.loc[row_index, "filename"] = new_filename
    df.loc[row_index, "age"] = age
    df.loc[row_index, "gender"] = gender
    df.loc[row_index, "ethnicity"] = ethnicity

In [15]:
# Pictures whose file names were inspected by hand and need a corrected name.
files_to_correct = ("61_1_20170109142408075.jpg", "61_3_20170109150557335.jpg")

for column in df:
    if column == "filename":
        continue
    try: # Find out which columns have problems
        df[column].astype('float64')
    except ValueError as err:
        print(f'\nColumn " {column} " could not be converted: {err!s}')
        # Find out which rows have problems
        for row_index, value in enumerate(df[column]):
            if value is None:
                continue
            try:
                int(value)
            except ValueError as row_err:
                print(f'Row {row_index} in column "{column}" with file name "{df["filename"][row_index]}" could not be converted: {row_err!s}')
                if df["filename"][row_index] in files_to_correct:
                    # These names lack one field: age comes first, the second
                    # field is passed on as ethnicity and gender is set to 1.
                    temp = df["filename"][row_index].split('_')
                    correct_picture(column, row_index, temp[0], 1, temp[1])

Filename should have the pattern: age_gender_ethnicity_id.jpg

We found the following issues (both file names are missing the gender field):
* Picture 61_1_20170109142408075.jpg is a Black woman. The picture will be renamed to 61_1_1_20170109142408075.jpg
* Picture 61_3_20170109150557335.jpg is an Indian woman. The picture will be renamed to 61_1_3_20170109150557335.jpg

In [16]:
# Check that corrections have been implemented: repeat the per-column
# summary (dtype, number of distinct values, value frequencies).
for column, series in df.items():
  print(f"\nColumn {column}, Data type: {series.dtype}")
  print(f"Unique values: {series.nunique()} ")
  print(series.value_counts())


Column filename, Data type: object
Unique values: 10137 
36_0_0_20170109003426572.jpg    1
1_0_2_20161219201523620.jpg     1
1_0_0_20161219195753899.jpg     1
1_0_2_20161219203236876.jpg     1
1_0_0_20161219200250923.jpg     1
                               ..
14_0_0_20170104012341136.jpg    1
14_0_3_20170104012427337.jpg    1
14_0_2_20170104012541763.jpg    1
12_1_2_20170104012405785.jpg    1
50_0_0_20170111201507166.jpg    1
Name: filename, Length: 10137, dtype: int64

Column age, Data type: object
Unique values: 99 
1      1271
2       524
3       303
26      280
4       270
       ... 
110       3
93        3
99        2
91        1
101       1
Name: age, Length: 99, dtype: int64

Column gender, Data type: object
Unique values: 2 
1    5598
0    4539
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 5 
0    5396
2    1703
3    1494
4    1121
1     423
Name: ethnicity, dtype: int64


## Analyze NaN values

In [17]:
# Check for the presence of NaN values in percentage
# df.size is rows * columns; it replaces np.product(df.shape), which is
# deprecated and was removed in NumPy 2.0.
missing_cells = df.isnull().sum().sum()
percentageNAN = round((missing_cells / df.size) * 100, 2)
print(f"There are {percentageNAN}% of NaN values.")

There are 0.0% of NaN values.


In [18]:
# Count the missing entries in each column.
print(df.isna().sum())

filename     0
age          0
gender       0
ethnicity    0
dtype: int64


## Check for duplicate rows

In [20]:
# True if any row is an exact duplicate of an earlier row.
df.duplicated().any()

False

# Store dataframe in .csv file

In [19]:
# Write the label table to the working directory; index=False leaves the
# row index out of the file.
df.to_csv('dataset_faces.csv', index=False)