In [1]:
import scipy.io
import numpy as np
import pandas as pd

## Load the IMDB-WIKI dataset

Get the image metadata here: https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/

Get the pairwise comparison data here: https://github.com/Toloka/IMDB-WIKI-SbS

In [49]:
# Parse the IMDB and Wikipedia metadata that ship as MATLAB .mat files.
# loadmat returns a dict; the payload sits under the 'imdb' / 'wiki' key.
imdb = scipy.io.loadmat('./data/imdb-wiki/imdb.mat')
wiki = scipy.io.loadmat('./data/imdb-wiki/wiki.mat')
wiki

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Jan 16 16:25:20 2016',
 '__version__': '1.0',
 '__globals__': [],
 'wiki': array([[(array([[723671, 703186, 711677, ..., 720620, 723893, 713846]], dtype=int32), array([[2009, 1964, 2008, ..., 2013, 2011, 2008]], dtype=uint16), array([[array(['17/10000217_1981-05-05_2009.jpg'], dtype='<U31'),
                 array(['48/10000548_1925-04-04_1964.jpg'], dtype='<U31'),
                 array(['12/100012_1948-07-03_2008.jpg'], dtype='<U29'), ...,
                 array(['09/9998109_1972-12-27_2013.jpg'], dtype='<U30'),
                 array(['00/9999400_1981-12-13_2011.jpg'], dtype='<U30'),
                 array(['80/999980_1954-06-11_2008.jpg'], dtype='<U29')]],
               dtype=object), array([[1., 1., 1., ..., 1., 1., 0.]]), array([[array(['Sami Jauhojärvi'], dtype='<U15'),
                 array(['Dettmar Cramer'], dtype='<U14'),
                 array(['Marc Okrand'], dtype='<U11'), ...,
                 arr

In [28]:
# Pull the 'full_path' field out of each MATLAB struct.
# The struct is stored as a 1x1 array and the field carries a leading
# singleton axis, hence the [0, 0] followed by another [0].
full_path_imdb = imdb['imdb']['full_path'][0, 0][0]
full_path_wiki = wiki['wiki']['full_path'][0, 0][0]
full_path_wiki

array([array(['17/10000217_1981-05-05_2009.jpg'], dtype='<U31'),
       array(['48/10000548_1925-04-04_1964.jpg'], dtype='<U31'),
       array(['12/100012_1948-07-03_2008.jpg'], dtype='<U29'), ...,
       array(['09/9998109_1972-12-27_2013.jpg'], dtype='<U30'),
       array(['00/9999400_1981-12-13_2011.jpg'], dtype='<U30'),
       array(['80/999980_1954-06-11_2008.jpg'], dtype='<U29')],
      dtype=object)

In [29]:
# Only keep the file names
def _basename(path):
    """Return the last '/'-separated component of ``path``.

    Taking the last component (rather than the second one) matches how the
    other cells of this notebook derive file names, and still works for
    paths with no directory or with more than one directory level.
    """
    return path.rsplit('/', 1)[-1]


def _filenames(full_paths):
    """Turn an array of one-element path arrays into an array of bare file names.

    Parameters
    ----------
    full_paths : array-like of arrays
        Each entry holds a single 'dir/file' string, as stored in the .mat files.

    Returns
    -------
    numpy.ndarray
        One file name per entry of ``full_paths``, in the same order.
    """
    return np.array([_basename(p) for p in np.stack(full_paths).flatten()])


filename_wiki = _filenames(full_path_wiki)
filename_imdb = _filenames(full_path_imdb)
filename_wiki

array(['10000217_1981-05-05_2009.jpg', '10000548_1925-04-04_1964.jpg',
       '100012_1948-07-03_2008.jpg', ..., '9998109_1972-12-27_2013.jpg',
       '9999400_1981-12-13_2011.jpg', '999980_1954-06-11_2008.jpg'],
      dtype='<U34')

In [30]:
# Gender labels are stored the same way as the paths: a field of the 1x1
# MATLAB struct with a leading singleton axis. Values come back as floats.
gender_imdb = imdb['imdb']['gender'][0, 0][0]
gender_wiki = wiki['wiki']['gender'][0, 0][0]
gender_imdb

array([1., 1., 1., ..., 0., 0., 0.])

In [41]:
# Build one (filename, gender) table per source, then stack IMDB on top of
# Wikipedia under a fresh 0..n-1 index.
imdb_df = pd.DataFrame(dict(filename=filename_imdb, gender=gender_imdb))
wiki_df = pd.DataFrame(dict(filename=filename_wiki, gender=gender_wiki))
gender_df = pd.concat([imdb_df, wiki_df], ignore_index=True)
gender_df

Unnamed: 0,filename,gender
0,nm0000001_rm124825600_1899-5-10_1968.jpg,1.0
1,nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0
2,nm0000001_rm577153792_1899-5-10_1968.jpg,1.0
3,nm0000001_rm946909184_1899-5-10_1968.jpg,1.0
4,nm0000001_rm980463616_1899-5-10_1968.jpg,1.0
...,...,...
523046,9996949_1937-04-17_1963.jpg,1.0
523047,9997032_1947-07-30_1970.jpg,1.0
523048,9998109_1972-12-27_2013.jpg,1.0
523049,9999400_1981-12-13_2011.jpg,1.0


In [50]:
# Ground-truth ages: read the CSV, derive the bare file name from the image
# URL in 'label' (text after the last '/'), and expose 'score' as 'age'.
gt_df = (
    pd.read_csv('./data/imdb-wiki/gt.csv')
    .assign(filename=lambda frame: frame['label'].map(lambda url: url.split('/')[-1]))
    .rename(columns={'score': 'age'})
)
gt_df

Unnamed: 0,label,age,filename
0,https://tlk.s3.yandex.net/annotation_tasks/IMD...,10,nm1442940_rm3965098752_1996-10-3_2006.jpg
1,https://tlk.s3.yandex.net/annotation_tasks/IMD...,10,nm4832920_rm1781768448_2003-8-28_2013.jpg
2,https://tlk.s3.yandex.net/annotation_tasks/IMD...,10,nm0652089_rm860657920_1992-3-10_2002.jpg
3,https://tlk.s3.yandex.net/annotation_tasks/IMD...,10,nm0004917_rm1493730304_1969-5-12_1979.jpg
4,https://tlk.s3.yandex.net/annotation_tasks/IMD...,10,nm1113550_rm1332711936_1996-4-14_2006.jpg
...,...,...,...
9145,https://tlk.s3.yandex.net/annotation_tasks/IMD...,70,475367_1941-08-03_2011.jpg
9146,https://tlk.s3.yandex.net/annotation_tasks/IMD...,70,304085_1919-07-07_1989.jpg
9147,https://tlk.s3.yandex.net/annotation_tasks/IMD...,70,nm0001627_rm4164078592_1927-2-20_1997.jpg
9148,https://tlk.s3.yandex.net/annotation_tasks/IMD...,70,nm0000024_rm1715129344_1904-4-14_1974.jpg


In [42]:
# Attach the gender to each ground-truth row by joining on 'filename'.
# A left join keeps every ground-truth row, matched or not.
ages = gt_df[['age', 'filename']]
genders = gender_df[['filename', 'gender']]
df = ages.merge(genders, on='filename', how='left')
df

Unnamed: 0,age,filename,gender
0,10,nm1442940_rm3965098752_1996-10-3_2006.jpg,0.0
1,10,nm4832920_rm1781768448_2003-8-28_2013.jpg,0.0
2,10,nm0652089_rm860657920_1992-3-10_2002.jpg,0.0
3,10,nm0004917_rm1493730304_1969-5-12_1979.jpg,0.0
4,10,nm1113550_rm1332711936_1996-4-14_2006.jpg,0.0
...,...,...,...
9145,70,475367_1941-08-03_2011.jpg,1.0
9146,70,304085_1919-07-07_1989.jpg,1.0
9147,70,nm0001627_rm4164078592_1927-2-20_1997.jpg,1.0
9148,70,nm0000024_rm1715129344_1904-4-14_1974.jpg,1.0


In [51]:
# Crowd-sourced pairwise comparisons. The 'left', 'right' and 'label' columns
# hold image URLs; reduce each to its bare file name (text after the last '/').
pairwise_df = pd.read_csv('./data/imdb-wiki/crowd_labels.csv')
for url_column in ('left', 'right', 'label'):
    pairwise_df[url_column] = pairwise_df[url_column].apply(lambda url: url.split('/')[-1])
pairwise_df

Unnamed: 0,left,right,label,performer
0,26147732_1991-07-22_2015.jpg,nm0707728_rm4048061440_1935-1-2_1966.jpg,nm0707728_rm4048061440_1935-1-2_1966.jpg,0
1,nm0000072_rm2497944320_1932-2-27_1985.jpg,nm0025978_rm3962736128_1954-11-16_1999.jpg,nm0000072_rm2497944320_1932-2-27_1985.jpg,0
2,nm0788370_rm2989144320_1971-4-11_1999.jpg,nm0726180_rm2487589632_1981-1-1_2005.jpg,nm0788370_rm2989144320_1971-4-11_1999.jpg,0
3,nm0325390_rm3409478912_1951-10-19_1979.jpg,1094676_1944-12-08_2011.jpg,1094676_1944-12-08_2011.jpg,1
4,1780768_1927-01-22_1963.jpg,nm0000442_rm1571854336_1944-1-23_2005.jpg,nm0000442_rm1571854336_1944-1-23_2005.jpg,1
...,...,...,...,...
250244,nm0000943_rm2478151680_1968-1-13_1999.jpg,nm0000825_rm1298385920_1952-10-7_1962.jpg,nm0000825_rm1298385920_1952-10-7_1962.jpg,3885
250245,26835279_1938-12-27_2004.jpg,nm0771493_rm519611392_1955-5-27_2013.jpg,26835279_1938-12-27_2004.jpg,3885
250246,2723_1964-05-27_2007.jpg,nm0858969_rm53517056_1990-1-30_2001.jpg,2723_1964-05-27_2007.jpg,4090
250247,nm0177396_rm3601370112_1987-9-15_2010.jpg,nm0005391_rm3942707712_1963-3-10_2015.jpg,nm0005391_rm3942707712_1963-3-10_2015.jpg,4090


In [45]:
# Replace every file name in the comparison columns with the index label of
# the first row of df that carries that file name.
#
# The lookup table is built once and applied with Series.map, instead of
# scanning all of df with a boolean mask for every single comparison entry.
first_rows = df.drop_duplicates(subset='filename', keep='first')
filename_to_index = dict(zip(first_rows['filename'], first_rows.index))

for column in ('left', 'right', 'label'):
    mapped = pairwise_df[column].map(filename_to_index)
    unmatched = mapped.isna()
    if unmatched.any():
        # Fail loudly and name the offending values; map() would otherwise
        # leave NaN behind and silently turn the column into floats.
        examples = pairwise_df.loc[unmatched, column].unique()[:5].tolist()
        raise KeyError(
            f"{int(unmatched.sum())} value(s) in column '{column}' have no "
            f"matching filename in df, e.g. {examples}"
        )
    pairwise_df[column] = mapped
pairwise_df

Unnamed: 0,left,right,label,performer
0,2229,3164,3164,0
1,6505,5302,6505,0
2,2822,2139,2822,0
3,2757,8593,8593,1
4,3991,7792,7792,1
...,...,...,...,...
250244,3209,7,7,3885
250245,8533,7276,8533,3885
250246,5025,228,5025,4090
250247,2059,6425,6425,4090


In [48]:
# Persist the cleaned tables. The ground-truth file keeps its row index
# (index=True) because the comparison columns refer to those index values;
# the comparisons' own row index carries no information and is dropped.
df.to_csv(path_or_buf='./data/imdb-wiki/ground_truth_cleaned.csv', index=True)
pairwise_df.to_csv(path_or_buf='./data/imdb-wiki/comparisons_cleaned.csv', index=False)

## Subsampling

In [54]:
# Reload the cleaned tables from disk; the first CSV column of the
# ground-truth file is restored as the DataFrame index.
df = pd.read_csv('./data/imdb-wiki/ground_truth_cleaned.csv', index_col=0)
pairwise_df = pd.read_csv('./data/imdb-wiki/comparisons_cleaned.csv')