In [21]:
import os
from six.moves import cPickle as pickle
import numpy as np
import scipy.io as sio
import scipy.misc as spm
from scipy import ndimage
import datetime
import matplotlib.image as plt
from IPython.display import Image, display
from skimage.transform import resize

# Location of the cropped IMDB face images and of the metadata .mat file.
# NOTE(review): absolute, machine-specific paths - adjust for your environment.
IMG_DIR = r'/home/ubuntu/coding/cnn/datasets/imdb_crop'
MAT_FILE = r'/home/ubuntu/coding/cnn/datasets/imdb_crop/imdb.mat'

# Target image geometry: square side length in pixels and number of channels.
img_size = 128
img_depth = 3

# Chunk size used when writing / reading very large pickles piece by piece;
# max_bytes is one byte below 2 GiB.
n_bytes = 1 << 31
max_bytes = n_bytes - 1

In [3]:
def reformat_date(mat_date):
    """Return the calendar year that corresponds to a serial date number.

    `mat_date` is shifted back by 366 days and clamped to a minimum of 1
    before being interpreted as a proleptic Gregorian ordinal
    (presumably the input is a MATLAB datenum - TODO confirm).

    Returns:
        The year of the resulting date as an int.
    """
    # Clamp to 1 because ordinal 1 is the smallest value fromordinal accepts.
    ordinal = np.max([mat_date - 366, 1])
    return datetime.date.fromordinal(ordinal).year

In [4]:
def create_path(path):
    """Join the first element of `path` onto the image directory IMG_DIR.

    `path` must be indexable; only `path[0]` is used
    (presumably a one-element array holding a relative file name - TODO confirm).
    """
    relative_name = path[0]
    return os.path.join(IMG_DIR, relative_name)

In [5]:
mat_struct = sio.loadmat(MAT_FILE)
# The 'imdb' variable is a 1x1 struct; [0, 0] yields the single record.
imdb_record = mat_struct['imdb'][0, 0]

keys = ['dob',
    'photo_taken',
    'full_path',
    'gender',
    'name',
    'face_location',
    'face_score',
    'second_face_score',
    'celeb_names',
    'celeb_id'
]

# Look every field up by NAME rather than relying on the positional order of
# the struct fields, and take row 0 of each field as before.
data_set = [imdb_record[key][0] for key in keys]

# Do not wrap data_set in np.asarray(): the per-field arrays do not all have
# the same length, and recent NumPy releases raise ValueError when asked to
# build an array from such a ragged list. Zipping the list directly yields
# exactly the same per-key arrays.
imdb_dict = dict(zip(keys, data_set))
imdb_dict['dob'] = [reformat_date(dob) for dob in imdb_dict['dob']]
imdb_dict['full_path'] = [create_path(path) for path in imdb_dict['full_path']]

# Add 'age' key to the dictionary (year the photo was taken minus birth year)
imdb_dict['age'] = imdb_dict['photo_taken'] - imdb_dict['dob']

print("Dictionary created...")

Dictionary created...


In [9]:
print("Converting {} samples. (0=all samples)".format(0))

raw_path = imdb_dict['full_path']
raw_age = imdb_dict['age']
raw_gender = imdb_dict['gender']
raw_sface = imdb_dict['second_face_score']

# Parallel output lists: entry k of each list describes the same kept sample.
age, gender, imgs = [], [], []
n_samples = len(raw_sface)
for i, sface in enumerate(raw_sface):
    # Progress report (with a peek at the current sample) every 200000 rows.
    if i % 200000 == 0:
        print("Processing {0} of {1}".format(i, n_samples))
#         display(Image(filename=raw_path[i]))
        print("Second face score: {}".format(sface), end=" ")
        print("Age: {}".format(raw_age[i]), end=" ")
        print("Gender: {}".format(raw_gender[i]))

    # A sample is kept only when its second-face score is NaN
    # (presumably: no second face detected - TODO confirm) ...
    if not np.isnan(sface):
        continue
    # ... its age is non-negative ...
    if not raw_age[i] >= 0:
        continue
    # ... and its gender is known (not NaN).
    if np.isnan(raw_gender[i]):
        continue

    age.append(raw_age[i])
    gender.append(raw_gender[i])
    imgs.append(raw_path[i])

Converting 0 samples. (0=all samples)
Processing 0 of 460723
Second face score: 1.1189733571573068 Age: 69 Gender: 1.0
Processing 200000 of 460723
Second face score: nan Age: 55 Gender: 1.0
Processing 400000 of 460723
Second face score: nan Age: 27 Gender: 0.0


In [10]:
# Convert images path to images.

# only take a subset of dataset: first 100000 imgs
# dataset = np.ndarray(shape=(100000, img_size, img_size, img_depth), dtype=np.float32)

# Build the three pickle paths with os.path.join. The previous check glued
# os.getcwd() and "pkl_folder/imdb_data_test.pkl" together without a path
# separator, so the test file was never found and the skip branch could
# never be taken.
pkl_dir = os.path.join(os.getcwd(), "pkl_folder")
pkl_names = ("imdb_data_train.pkl", "imdb_data_valid.pkl", "imdb_data_test.pkl")

if all(os.path.exists(os.path.join(pkl_dir, pkl_name)) for pkl_name in pkl_names):
    # NOTE: in this branch `real_imgs` is NOT (re)built, so later cells that
    # slice `real_imgs` only work if it already exists in the kernel.
    print("Dataset already present - Skip convert images to images.")
else:
    print("Converting images path to images.")
    real_imgs = []
    # Only the first 100000 filtered image paths are loaded.
    for img_path in imgs[:100000]:
        # Read as RGB, resize to 128x128 and store as float32.
        # NOTE(review): scipy.misc.imread / imresize are no longer available in
        # recent SciPy releases - confirm the installed version provides them.
        resized = spm.imresize(spm.imread(img_path, mode='RGB'), (128, 128, 3))
        real_imgs.append(np.asarray(resized, dtype=np.float32))

    print("Original size: {0} - Preprocess size: {1}".format(len(raw_sface), len(real_imgs)))
    
#     print("Converting images path to images.")
#     for i, img_path in enumerate(imgs):
#         if i == 100000:
#             break
#         image_data = resize(((ndimage.imread(img_path).astype(float) - img_depth / 2) / img_depth), 
#                             (img_size, img_size, img_depth), mode='reflect')
#         dataset[i, :, :, :] = image_data

#     print("Original size: {0} - Preprocess size: {1}".format(len(raw_sface), len(dataset)))

Converting images path to images.
Original size: 460723 - Preprocess size: 100000


In [11]:
# Training split: the first 60000 samples of images, ages and genders,
# each converted to a NumPy array.
data_train = dict(
    image_inputs=np.array(real_imgs[:60000]),
    age_labels=np.array(age[:60000]),
    gender_labels=np.array(gender[:60000]),
)

In [17]:
# Serialise the training split in memory, then write it to disk in slices of
# at most `max_bytes` bytes instead of a single huge write call.
bytes_out = pickle.dumps(data_train, pickle.HIGHEST_PROTOCOL)
train_pkl_file = os.path.join(os.getcwd(),"pkl_folder/imdb_data_train.pkl")
with open(train_pkl_file, 'wb') as f_out:
    offset = 0
    while offset < len(bytes_out):
        f_out.write(bytes_out[offset:offset + max_bytes])
        offset += max_bytes


In [None]:
# Read the training pickle back in slices of at most `max_bytes` bytes,
# accumulate them in one buffer and unpickle the result.
train_pkl_file = os.path.join(os.getcwd(),"pkl_folder/imdb_data_train.pkl")
input_size = os.path.getsize(train_pkl_file)
bytes_in = bytearray(0)
with open(train_pkl_file, 'rb') as f_in:
    # One read per max_bytes-sized slice of the file.
    n_reads = len(range(0, input_size, max_bytes))
    for _ in range(n_reads):
        bytes_in.extend(f_in.read(max_bytes))
data2 = pickle.loads(bytes_in)

In [None]:
# Image geometry: square side length in pixels and number of channels.
img_size = 128
img_depth = 3


def dump_data(data, path, chunk_size=2**31 - 1):
    """Pickle `data` to `path`, writing the bytes in bounded-size chunks.

    A single write call of 2 GiB or more is known to fail on some platforms,
    so the serialised payload is written in slices of at most `chunk_size`
    bytes.

    Args:
        data: Any picklable object.
        path: Destination file path; its directory must already exist.
        chunk_size: Maximum number of bytes per write call.

    Note:
        The whole payload is serialised in memory before being written.
    """
    payload = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
    with open(path, 'wb') as f_out:
        for start in range(0, len(payload), chunk_size):
            f_out.write(payload[start:start + chunk_size])


def create_pickle(force=False):
    """Write the train / valid / test splits to `<cwd>/pkl_folder/`.

    The splits are taken from the module-level lists `real_imgs`, `age` and
    `gender`: samples [0, 60000) form the train split, [60000, 80000) the
    valid split and [80000, 100000) the test split. Each split is stored as a
    dict with keys 'image_inputs', 'age_labels' and 'gender_labels'.

    Args:
        force: When False and all three pickle files already exist, nothing is
            written. When True the files are always rewritten.

    Returns:
        Tuple (data_train_path, data_valid_path, data_test_path).
    """
    pkl_dir = os.path.join(os.getcwd(), "pkl_folder")
    # (split name, first sample index, one-past-last sample index)
    splits = (('train', 0, 60000),
              ('valid', 60000, 80000),
              ('test', 80000, 100000))
    paths = {name: os.path.join(pkl_dir, "imdb_data_{}.pkl".format(name))
             for name, _, _ in splits}

    if not force and all(os.path.exists(p) for p in paths.values()):
        # You may override by setting force=True.
        print("Dataset already present - Skip pickling.")
    else:
        # Create the target folder if needed so open() does not fail.
        os.makedirs(pkl_dir, exist_ok=True)
        for name, start, stop in splits:
            print('Dump data {}'.format(name))
            split_data = {'image_inputs': np.array(real_imgs[start:stop]),
                          'age_labels': np.array(age[start:stop]),
                          'gender_labels': np.array(gender[start:stop])
                          }
            print("Dataset {} size: {}".format(name, len(split_data['image_inputs'])))
            # Every split goes through the same chunked writer.
            dump_data(split_data, paths[name])
            print("Dataset {0} size: imdb_data_{0}.pkl".format(name))

    return paths['train'], paths['valid'], paths['test']

# force=True bypasses the "already present" shortcut, so the three pickle
# files are always (re)written before their paths are returned.
(data_train_path,
 data_valid_path,
 data_test_path) = create_pickle(force=True)

In [None]:
def _age_bucket(age_label):
    """Map an age to its class index, or None when no class applies (e.g. NaN)."""
    if age_label < 30:
        return 0
    if age_label <= 45:
        return 1
    if age_label < 60:
        return 2
    if age_label >= 60:
        return 3
    return None


def convert_label(pickle_file):
    """Load a pickled split and turn its ages into class labels.

    The pickle must contain a dict with the keys 'image_inputs', 'age_labels'
    and 'gender_labels'. Every age is mapped to a class:
    0 for age < 30, 1 for 30..45, 2 for 46..59 and 3 for age >= 60.

    Samples whose age matches no class (only possible for values such as NaN)
    are dropped from BOTH the images and the labels so the two stay aligned.

    Args:
        pickle_file: Path of the pickle file to read.

    Returns:
        (dataset, labels) where `dataset` holds the images and `labels` is a
        one-element list wrapping the list of [age_class, gender] pairs.
        NOTE(review): the extra outer list is kept for backward compatibility -
        confirm that callers really expect `labels[0]` to be the pair list.

    Raises:
        Any exception raised while reading or processing the file is reported
        on stdout and re-raised.
    """
    try:
        # pickle.load can execute arbitrary code: only use on trusted files.
        with open(pickle_file, 'rb') as f:
            data_train = pickle.load(f)

        # The earlier version allocated an uninitialised full-size buffer,
        # shuffled it and then discarded it; the returned images were never
        # shuffled. That dead (and memory hungry) step is removed.
        dataset = data_train['image_inputs']
        gender_labels = data_train['gender_labels']

        kept_rows = []
        tmp_labels = []
        for i, age_label in enumerate(data_train['age_labels']):
            age_class = _age_bucket(age_label)
            if age_class is None:
                continue
            kept_rows.append(i)
            tmp_labels.append([age_class, gender_labels[i]])

        # Only index (and therefore copy) the images when a row was dropped.
        if len(kept_rows) != len(dataset):
            dataset = np.asarray(dataset)[kept_rows]

        labels = [tmp_labels]
        return dataset, labels

    except Exception as e:
        print('Unable to process data from', pickle_file, ':', e)
        raise

# Load the training pickle and derive its [age_class, gender] labels.
(train_dataset,
 train_labels) = convert_label(data_train_path)

In [None]:
# Re-read the training split from disk.
# pickle.load can execute arbitrary code: only use on trusted files.
with open(data_train_path, mode='rb') as f:
    data_train = pickle.load(file=f)

In [None]:
# Echo the `data_train` dictionary as the cell output (scratch inspection).
data_train

In [None]:
# Stack the age and gender label vectors into one array: row 0 holds the
# ages, row 1 the genders.
label_keys = ('age_labels', 'gender_labels')
labels = np.array([data_train[key] for key in label_keys])

In [None]:
# Echo the `labels` value as the cell output (scratch inspection).
labels

In [None]:
# Echo the path of the training pickle as the cell output.
data_train_path

In [None]:
# Scratch check: an empty list literal is of type `list`.
tmp123 = list()
type(tmp123)


In [None]:
        # Uninitialised float32 buffer with one (img_size, img_size, img_depth)
        # slot per image in the first 60000 entries of real_imgs.
        img_inputs = np.empty((len(real_imgs[:60000]), img_size, img_size, img_depth), dtype=np.float32)


In [None]:
# Compare the shape of one converted image with the shape of the
# preallocated `img_inputs` buffer.
print(real_imgs[0].shape)
print(img_inputs.shape)