In this book we import the names of the pictures, interpret the labels and create a dataframe. We also make sure that the labels are correct.

# Import libraries and Modules

In [44]:
#path_to_pictures = '/content/drive/MyDrive/pic/'
path_to_pictures = '../pics/orginal_pics'

In [45]:
# Download files from google drive
! pip install gdown
# Link to part1: https://drive.google.com/file/d/1mb5Z24TsnKI3ygNIlX6ZFiwUj0_PmpAW/view?usp=share_link
! gdown -O ../pics/orginal_pics/part1.tar "1mb5Z24TsnKI3ygNIlX6ZFiwUj0_PmpAW"
# Link to part2: https://drive.google.com/file/d/19vdaXVRtkP-nyxz1MYwXiFsh_m_OL72b/view?usp=share_link
! gdown -O ../pics/orginal_pics/part2.tar "19vdaXVRtkP-nyxz1MYwXiFsh_m_OL72b"
# Link to part3: https://drive.google.com/file/d/1oj9ZWsLV2-k2idoW_nRSrLQLUP3hus3b/view?usp=share_link
! gdown -O ../pics/orginal_pics/part3.tar "1oj9ZWsLV2-k2idoW_nRSrLQLUP3hus3b"




[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Downloading...
From: https://drive.google.com/uc?id=1mb5Z24TsnKI3ygNIlX6ZFiwUj0_PmpAW
To: C:\Users\vio_g\CVNLP\CV_project\pics\orginal_pics\part1.tar

  0%|          | 0.00/874M [00:00<?, ?B/s]
  0%|          | 524k/874M [00:00<03:29, 4.18MB/s]
  0%|          | 1.57M/874M [00:00<02:12, 6.56MB/s]
  0%|          | 2.62M/874M [00:00<01:49, 7.96MB/s]
  0%|          | 3.67M/874M [00:00<02:10, 6.69MB/s]
  1%|          | 5.24M/874M [00:00<01:47, 8.06MB/s]
  1%|          | 6.29M/874M [00:00<01:41, 8.58MB/s]
  1%|          | 7.34M/874M [00:00<01:38, 8.83MB/s]
  1%|          | 8.39M/874M [00:01<01:36, 8.99MB/s]
  1%|1         | 9.44M/874M [00:01<01:38, 8.79MB/s]
  1%|1         | 10.5M/874M [00:01<01:36, 8.95MB/s]
  1%|1         | 12.1M/874M [00:01<01:25, 10.1MB/s]
  2%|1         | 13.6M/874M [00:01<01:22, 10.4MB/s]
  2%|1         | 15.2M/874M [00:01<01:20, 10.7MB/s]
  2%|1   

In [46]:
import os
import pandas as pd
import numpy as np

In [47]:
# Extract files
import tarfile
try:
  tar = tarfile.open(path_to_pictures + '/part1.tar')
  tar.extractall(path=path_to_pictures)
  tar.close()
except:
  print("Pictures could not be extracted for part 1")

try:
  tar = tarfile.open(path_to_pictures + '/part2.tar')
  tar.extractall(path=path_to_pictures)
  tar.close()
except:
  print("Pictures could not be extracted for part2")

try:
  tar = tarfile.open(path_to_pictures + '/part3.tar')
  tar.extractall(path=path_to_pictures)
  tar.close()
except:
  print("Pictures could not be extracted for part 3")


In [48]:
# Move pictures
import shutil
for source_folder in ('../pics/orginal_pics/part1/', '../pics/orginal_pics/part2/','../pics/orginal_pics/part3/' ):
    for file_name in os.listdir(source_folder):
        # construct full file path
        source = source_folder + file_name
        destination = '../pics/orginal_pics/' + file_name
        if os.path.isfile(source):
            shutil.move(source, destination)

# Upload Pictures

In [49]:
path = os.path.realpath(path_to_pictures)
# If the given path is a directory, open the directory and store the pictures in  a list
picture_list = []
if os.path.isdir(path):
    for file in os.listdir(path):
        if file.endswith(".jpg"):
            picture_list.append(file)

In [50]:
picture_list[0:5]

['100_0_0_20170112213500903.jpg',
 '100_0_0_20170112215240346.jpg',
 '100_1_0_20170110183726390.jpg',
 '100_1_0_20170112213001988.jpg',
 '100_1_0_20170112213303693.jpg']

# Create a Dataframe

The labels of each face image is embedded in the file name, formated like ```
[age]_[gender]_[race]_[date&time].jpg```  where:

* ```age``` is an integer from 0 to 116, indicating the age
* ```gender``` is either 0 (male) or 1 (female)
* ```race``` is an integer from 0 to 4, denoting White, Black, Asian, Indian, and Others (like Hispanic, Latino, Middle Eastern).
* ```date&time``` is in the format of yyyymmddHHMMSSFFF, showing the date and time an image was collected to UTKFace

We collect this data and store it in a dataframe.

In [51]:
# Each picture name looks like: age_gender_ethnicity_id.jpg
print(picture_list[0])

100_0_0_20170112213500903.jpg


In [52]:
# We applied split() to get an array with the labels for the picture.
temp = picture_list[0].split('_')
print(temp)

['100', '0', '0', '20170112213500903.jpg']


In [53]:
age_labels = []
gender_labels = []
ethnicity_labels = []

for file in picture_list:
    temp = file.split("_")
    age_labels.append(temp[0])
    gender_labels.append(temp[1])
    ethnicity_labels.append(temp[2])


Convert to a dataframe

In [54]:
df = pd.DataFrame()
df["filename"] = picture_list
df["age"] = age_labels
df["gender"] = gender_labels
df["ethnicity"] = ethnicity_labels

In [55]:
# map labels
gender_dict = {0: 'Male', 1: 'Female'}
ethnicity_dict = {}

# Data inspection and Data cleaning

## Data shape

In [56]:
df.shape

(24106, 4)

In [57]:
display(df)

Unnamed: 0,filename,age,gender,ethnicity
0,100_0_0_20170112213500903.jpg,100,0,0
1,100_0_0_20170112215240346.jpg,100,0,0
2,100_1_0_20170110183726390.jpg,100,1,0
3,100_1_0_20170112213001988.jpg,100,1,0
4,100_1_0_20170112213303693.jpg,100,1,0
...,...,...,...,...
24101,9_1_3_20161220222856346.jpg,9,1,3
24102,9_1_3_20170104222949455.jpg,9,1,3
24103,9_1_4_20170103200637399.jpg,9,1,4
24104,9_1_4_20170103200814791.jpg,9,1,4


## Check data types and proper spelling / labels

In [58]:
for column in df:
  unique_values = df[column].nunique()
  print(f"\nColumn {column}, Data type: {df[column].dtype}")
  print(f"Unique values: {unique_values} ")
  print(df[column].value_counts())


Column filename, Data type: object
Unique values: 24106 
45_0_0_20170117171151786.jpg    1
42_1_0_20170116221807908.jpg    1
1_1_4_20161221192830036.jpg     1
17_0_4_20170103210008641.jpg    1
43_1_0_20170117155110305.jpg    1
                               ..
32_1_2_20170116190147549.jpg    1
53_0_0_20170111201139891.jpg    1
37_1_1_20170112220140249.jpg    1
1_0_3_20161219230534953.jpg     1
38_0_2_20170117120432535.jpg    1
Name: filename, Length: 24106, dtype: int64

Column age, Data type: object
Unique values: 104 
26     2206
1      1282
28      921
35      881
24      861
       ... 
115       3
101       2
91        2
103       1
111       1
Name: age, Length: 104, dtype: int64

Column gender, Data type: object
Unique values: 4 
0    12581
1    11523
         1
3        1
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 8 
0                        10222
1                         4558
3                         4027
2                         3586
4 

We investigate that the data is correctly written as follows: Each of these labels (age, gender and ethnicity) are numbers, therefore it should be possible to transform or cast these labels from string to integer. If this is not possible, it may be because they are not written correctly. To correct the name of these images we have written the function ```correct_picture ```.

In [59]:
def correct_picture(column_name, row_index, correct_age, correct_gender, correct_ethnicity):
    temp = df[column_name][row_index].split('_')
    new_filename = '{}_{}_{}_{}'.format(correct_age, correct_gender, correct_ethnicity, temp[-1])

    # rename file in the system
    old_name = f'{path_to_pictures}/{df["filename"][row_index]}'
    new_name = f'{path_to_pictures}/{new_filename}'
    os.rename(old_name, new_name)

    # rename file in dataset
    df["filename"][row_index] = new_filename
    df["age"][row_index] = correct_age
    df["gender"][row_index] = correct_gender
    df["ethnicity"][row_index] = correct_ethnicity

In [60]:
for column in df:
    if column != "filename":
        try: # Find out which columns have problems
            df[column].astype('float64')
        except ValueError as err:
            print(f'\nColumn " {column} " could not be converted: {err!s}')
            # Find out which rows have problems
            for row_index, value in enumerate(df[column]):
                if value is not None:
                    try:
                      int(value)
                    except ValueError as err:
                      print(f'Row {row_index} in column "{column}" with file name "{df["filename"][row_index]}" could not be converted: {err!s}')
                      if (df["filename"][row_index] == "61_1_20170109142408075.jpg" or df["filename"][row_index] == "61_3_20170109150557335.jpg" or df["filename"][row_index] == "39_1_20170116174525125.jpg" or df["filename"][row_index] == "53__0_20170116184028385.jpg"):
                        temp = df["filename"][row_index].split('_')
                        correct_picture(column, row_index, temp[0], 1, temp[1])
                      elif(df["filename"][row_index] == "53_1__.jpg" ):
                        df = df.drop(index=row_index)


Column " gender " could not be converted: could not convert string to float: ''
Row 19144 in column "gender" with file name "53__0_20170116184028385.jpg" could not be converted: invalid literal for int() with base 10: ''

Column " ethnicity " could not be converted: could not convert string to float: '20170116174525125.jpg'
Row 15279 in column "ethnicity" with file name "39_1_20170116174525125.jpg" could not be converted: invalid literal for int() with base 10: '20170116174525125.jpg'
Row 19144 in column "ethnicity" with file name "53_1__" could not be converted: invalid literal for int() with base 10: ''
Row 21119 in column "ethnicity" with file name "61_1_20170109142408075.jpg" could not be converted: invalid literal for int() with base 10: '20170109142408075.jpg'
Row 21131 in column "ethnicity" with file name "61_3_20170109150557335.jpg" could not be converted: invalid literal for int() with base 10: '20170109150557335.jpg'


Filename should have the pattern: age_gender_ethnicity_id.jpg

We found the following incidences: 
* Picture 39_1_20170116174525125.jpg is a black woman. Picture will be renamed as 39_1_1_20170116174525125.jpg
* Picture 53__0_20170116184028385.jpg is a white woman. Picture will be renamed as 53_1_0_20170116184028385.jpg
* Picture 61_1_20170109142408075.jpg is a black woman. Picture will be renamed to 61_1_1_20170109142408075.jpg
* Picture 61_3_20170109150557335.jpg is an indian woman. Picture will be renamed to 61_1_3_20170109150557335.jpg

In [61]:
# Check that corrections have been implemented
for column in df:
  unique_values = df[column].nunique()
  print(f"\nColumn {column}, Data type: {df[column].dtype}")
  print(f"Unique values: {unique_values} ")
  print(df[column].value_counts())


Column filename, Data type: object
Unique values: 24106 
45_0_0_20170117171151786.jpg    1
70_0_1_20170120225116936.jpg    1
17_0_4_20170103210008641.jpg    1
43_1_0_20170117155110305.jpg    1
35_0_4_20170117153235020.jpg    1
                               ..
53_0_0_20170111201139891.jpg    1
37_1_1_20170112220140249.jpg    1
1_0_3_20161219230534953.jpg     1
45_0_3_20170117183557423.jpg    1
38_0_2_20170117120432535.jpg    1
Name: filename, Length: 24106, dtype: int64

Column age, Data type: object
Unique values: 104 
26     2206
1      1282
28      921
35      881
24      861
       ... 
115       3
101       2
91        2
103       1
111       1
Name: age, Length: 104, dtype: int64

Column gender, Data type: object
Unique values: 3 
0    12581
1    11521
1        4
Name: gender, dtype: int64

Column ethnicity, Data type: object
Unique values: 6 
0    10221
1     4560
3     4028
2     3586
4     1710
         1
Name: ethnicity, dtype: int64


## Analyze NaN values

In [62]:
# Check for the presence of NaN values in percentage
percentageNAN = round((((df.isnull().sum()).sum() / np.product(df.shape)) * 100), 2)
print(f"There are {percentageNAN}% of NaN values.")

There are 0.0% of NaN values.


In [63]:
# Number of missing values per column
print(df.isnull().sum())

filename     0
age          0
gender       0
ethnicity    0
dtype: int64


## Check for double values

In [64]:
df.duplicated().any()

False

# Store dataframe in .csv file

In [65]:
df.to_csv('../datasets/dataset_faces.csv', index=False)

In [66]:
# Save all pictures in a .tar file
import tarfile
path = path_to_pictures + 'all_pictures.tar'
tar = tarfile.open(path, "w")

from tqdm import tqdm
for name in  tqdm(df['filename']):
  new_path = path_to_pictures + '/' + name
  tar.add(new_path)
tar.close()

100%|██████████| 24106/24106 [03:09<00:00, 126.96it/s]
