In [1]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.utils import shuffle

In [2]:
np.random.seed(42)
random.seed(42)

In [3]:
os.chdir("/home/mate/develop/PycharmProjects/GeFace/")
try:
    csv_file = pd.read_csv("data/imdb.csv", delimiter=';', encoding="ISO-8859-1", engine='python')
    pd.set_option('display.max_columns', 100)
except (FileNotFoundError):
    print("CSV file not found")
    current_path = os.getcwd()
    print("Current path is " + current_path)

In [4]:
csv_file.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,1899,1968,01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,[1072.926 161.838 1214.784 303.696],1.459693,1.118973,6488
1,1899,1970,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,[477.184 100.352 622.592 245.76],2.543198,1.852008,6488
2,1899,1968,01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,[114.969643089629 114.969643089629 451.6865723...,3.455579,2.98566,6488
3,1899,1968,01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,[622.885505642659 424.217503837008 844.3390076...,1.872117,,6488
4,1899,1968,01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,[1013.85900236037 233.882042207585 1201.586127...,1.158766,,6488


In [5]:
# drop unneccessary columns
df = csv_file.drop(columns=['name', 'face_location','face_score','second_face_score','celeb_id'])

In [6]:
# calculate ages
df['dob'] = df['photo_taken'] - df['dob']

In [7]:
df = df.drop(columns=['photo_taken'])
df = df.rename(index=str, columns={"dob": "age"})

In [8]:
df = df.query('age >= 0 and age <=100')

In [9]:
df.head()

Unnamed: 0,age,full_path,gender
0,69,01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0
1,71,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0
2,69,01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0
3,69,01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0
4,69,01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0


In [10]:
# shuffle the rows
df = shuffle(df)

In [11]:
df.head()

Unnamed: 0,age,full_path,gender
396541,22,41/nm2374841_rm226474752_1987-3-19_2009.jpg,0.0
177124,37,27/nm0005527_rm1698345984_1972-7-10_2009.jpg,0.0
424187,24,53/nm0488953_rm3146883072_1989-10-1_2013.jpg,0.0
250688,16,27/nm1227027_rm3763247360_1987-8-8_2003.jpg,1.0
187962,34,57/nm0206257_rm2710020352_1979-5-9_2013.jpg,0.0


## Data splitting

Split the dataset into train, validation and test dataset. Rule is : 70-20-10 

In [12]:
df.shape

(460070, 3)

In [15]:
# calculate test train valid data numbers
test_num = int(np.floor(0.1 * df.shape[0]))
valid_num = int(np.floor(0.2 * df.shape[0]))
train_num = int(df.shape[0] - test_num - valid_num)
print("train: {} | valid: {} | test: {}".format(train_num, valid_num, test_num))

train: 322049 | valid: 92014 | test: 46007


In [34]:
# split the data into train valid and test data
train_data = df.iloc[0:train_num,:]
train_data.head()
train_data.shape

(322049, 3)

In [35]:
valid_data = df.iloc[train_num:train_num + valid_num, :]
valid_data.head()
valid_data.shape

(92014, 3)

In [36]:
test_data = df.iloc[ train_num+valid_num:,:]
test_data.head()
test_data.shape

(46007, 3)

In [50]:
train = []
for i in range(101):
     train.append(train_data.query('age == {}'.format(i)))
print(train)

[        age                                     full_path  gender
295275    0  77/nm0837177_rm1609673728_1975-9-18_1975.jpg     1.0
365156    0   24/nm0266824_rm821681408_1994-2-23_1994.jpg     0.0
295262    0  77/nm0837177_rm1282260224_1975-9-18_1975.jpg     1.0
396021    0  75/nm1377375_rm2251480064_1981-8-25_1981.jpg     0.0
295333    0  77/nm0837177_rm2483204352_1975-9-18_1975.jpg     1.0
409229    0  22/nm1973422_rm811972864_1988-11-20_1988.jpg     1.0
295432    0   77/nm0837177_rm693221632_1975-9-18_1975.jpg     1.0
396023    0  75/nm1377375_rm2268257280_1981-8-25_1981.jpg     0.0
295257    0  77/nm0837177_rm1132961792_1975-9-18_1975.jpg     1.0
221917    0   44/nm4314944_rm976596224_2010-2-12_2010.jpg     0.0
295255    0  77/nm0837177_rm1099407360_1975-9-18_1975.jpg     1.0
295429    0   77/nm0837177_rm642889984_1975-9-18_1975.jpg     1.0
295396    0  77/nm0837177_rm4072781312_1975-9-18_1975.jpg     1.0
295340    0  77/nm0837177_rm2738992640_1975-9-18_1975.jpg     1.0
295267   

In [51]:
valid = []
for i in range(101):
     valid.append(valid_data.query('age == {}'.format(i)))
print(valid)

[        age                                     full_path  gender
295273    0  77/nm0837177_rm1592896512_1975-9-18_1975.jpg     1.0
396019    0  75/nm1377375_rm2234702848_1981-8-25_1981.jpg     0.0
221914    0  44/nm4314944_rm3492392448_2010-2-12_2010.jpg     0.0
295412    0   77/nm0837177_rm432904448_1975-9-18_1975.jpg     1.0
295336    0   77/nm0837177_rm257277184_1975-9-18_1975.jpg     1.0
295305    0  77/nm0837177_rm1918735360_1975-9-18_1975.jpg     1.0
229524    0  49/nm0614249_rm3025372160_1964-6-17_1964.jpg     0.0
295268    0  77/nm0837177_rm1472707072_1975-9-18_1975.jpg     1.0
295321    0  77/nm0837177_rm2174401280_1975-9-18_1975.jpg     1.0
221911    0  44/nm4314944_rm1423879680_2010-2-12_2010.jpg     0.0
221915    0  44/nm4314944_rm3593055744_2010-2-12_2010.jpg     0.0,         age                                      full_path  gender
35774     1    33/nm0000233_rm394710784_1963-3-27_1964.jpg     1.0
390074    1  79/nm0272479_rm3291919104_1975-10-22_1976.jpg     1.0
32451

In [None]:
test = []
for i in range(101):
     test.append(test_data.query('age == {}'.format(i)))
print(test)