In [1]:
import pandas as pd
import numpy as np
import cv2    
import matplotlib.pyplot as plt
import seaborn as sns

from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras import optimizers
from keras.models import Sequential, Model 
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils, load_img
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications import vgg16

from IPython.display import Image
import tensorflow as tf

In [2]:
main_folder = 'input/celeba-dataset/'
images_folder = main_folder + 'img_align_celeba/'
weights_folder = 'weights/'
EXAMPLE_PIC = images_folder + '000506.jpg'

TRAINING_SAMPLES = 10000
VALIDATION_SAMPLES = 2000
TEST_SAMPLES = 2000
IMG_WIDTH = 224
IMG_HEIGHT = 224
BATCH_SIZE = 16
NUM_EPOCHS = 20

## ATTRIBUTE DF

In [3]:
# import the data set that include the attribute for each picture
df_attr = pd.read_csv(main_folder + 'list_attr_celeba.csv')
df_attr.columns

Index(['file_name', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Attractive',
       'Bags_Under_Eyes', 'Bald', 'Bangs', 'Big_Lips', 'Big_Nose',
       'Black_Hair', 'Blond_Hair', 'Blurry', 'Brown_Hair', 'Bushy_Eyebrows',
       'Chubby', 'Double_Chin', 'Eyeglasses', 'Goatee', 'Gray_Hair',
       'Heavy_Makeup', 'High_Cheekbones', 'Male', 'Mouth_Slightly_Open',
       'Mustache', 'Narrow_Eyes', 'No_Beard', 'Oval_Face', 'Pale_Skin',
       'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks', 'Sideburns',
       'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings',
       'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace',
       'Wearing_Necktie', 'Young'],
      dtype='object')

In [4]:
df_attr.set_index('file_name', inplace=True)
df_attr.replace(to_replace=-1, value=0, inplace=True) #replace -1 by 0
df_attr.shape

(202599, 40)

## PARTITION DF (MALE, YOUNG, ATTRACTIVE)

In [5]:
df_partition = pd.read_csv(main_folder + 'list_eval_partition.csv')
df_partition.head()
df_partition['val'].value_counts().sort_index()

# join the partition with the attributes
df_partition.set_index('file_name', inplace=True)
df_par_attr = df_partition.join(df_attr['Male'], how='inner')
df_par_attr = df_par_attr.join(df_attr['Young'], how='inner')
df_par_attr = df_par_attr.join(df_attr['Attractive'], how='inner')
df_par_attr.head()

Unnamed: 0_level_0,val,Male,Young,Attractive
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000001.jpg,0,0,1,1
000002.jpg,0,0,1,0
000003.jpg,0,1,1,0
000004.jpg,0,0,1,1
000005.jpg,0,0,1,1


## Generate Train Dataset

In [6]:
df_train = df_par_attr[
    (df_par_attr['val'] == 0) & 
    (df_par_attr['Male'] == 0) & 
    (df_par_attr['Young'] == 0) 
].sample(TRAINING_SAMPLES//2)

df_train = pd.concat([
    df_train,
    df_par_attr[(df_par_attr['val'] == 0) & (df_par_attr['Male'] == 1)].sample(TRAINING_SAMPLES//2)
])

df_train.reset_index(inplace=True)
df_train

Unnamed: 0,file_name,val,Male,Young,Attractive
0,147047.jpg,0,0,0,1
1,046424.jpg,0,0,0,0
2,157047.jpg,0,0,0,0
3,066534.jpg,0,0,0,0
4,033701.jpg,0,0,0,1
...,...,...,...,...,...
9995,159432.jpg,0,1,0,0
9996,083940.jpg,0,1,0,0
9997,060290.jpg,0,1,1,0
9998,136597.jpg,0,1,0,0


## Load images from folder

In [7]:
from keras.preprocessing.image import ImageDataGenerator

column = ['Male', 'Young', 'Attractive']

datagen = ImageDataGenerator(rescale=1./255)

train_generator = datagen.flow_from_dataframe(
    dataframe=df_train, 
    directory=images_folder, 
    x_col='file_name', 
    y_col=column, 
    class_mode="raw", 
    target_size=(IMG_HEIGHT, IMG_WIDTH), 
    batch_size=BATCH_SIZE,
    shuffle=True
)

Found 10000 validated image filenames.


## Generate Validation Dataset

In [8]:
df_val = df_par_attr[(df_par_attr['val'] == 1) & (df_par_attr['Male'] == 0)].sample(VALIDATION_SAMPLES//2)
df_val = pd.concat([
    df_val,
    df_par_attr[(df_par_attr['val'] == 1) & (df_par_attr['Male'] == 1)].sample(VALIDATION_SAMPLES//2)
])

df_val.reset_index(inplace=True)
df_val['Male']

0       0
1       0
2       0
3       0
4       0
       ..
1995    1
1996    1
1997    1
1998    1
1999    1
Name: Male, Length: 2000, dtype: int64

In [9]:
val_generator = datagen.flow_from_dataframe(
    dataframe=df_val, 
    directory=images_folder, 
    x_col='file_name', 
    y_col=column, 
    class_mode="raw", 
    target_size=(IMG_HEIGHT, IMG_WIDTH), 
    batch_size=BATCH_SIZE,
    shuffle=True
)

Found 2000 validated image filenames.


## Generate Test Dataset

In [10]:
df_test = df_par_attr[(df_par_attr['val'] == 2) & (df_par_attr['Male'] == 0)].sample(TEST_SAMPLES//2)
df_test = pd.concat([
    df_test,
    df_par_attr[(df_par_attr['val'] == 2) & (df_par_attr['Male'] == 1)].sample(TEST_SAMPLES//2)
])

df_test.reset_index(inplace=True)
df_test

Unnamed: 0,file_name,val,Male,Young,Attractive
0,183892.jpg,2,0,1,1
1,185438.jpg,2,0,1,1
2,189382.jpg,2,0,0,1
3,195089.jpg,2,0,1,1
4,200868.jpg,2,0,1,1
...,...,...,...,...,...
1995,199588.jpg,2,1,0,0
1996,190567.jpg,2,1,0,0
1997,183845.jpg,2,1,0,0
1998,193323.jpg,2,1,0,0


In [11]:
test_generator = datagen.flow_from_dataframe(
    dataframe=df_test, 
    directory=images_folder, 
    x_col='file_name', 
    y_col=column, 
    class_mode='raw', 
    target_size=(IMG_HEIGHT, IMG_WIDTH), 
    batch_size=BATCH_SIZE,
    shuffle=False
)

Found 2000 validated image filenames.


In [12]:
# df_train.plot(x="file_name", y=column, kind="bar", figsize=(9, 8))
# plt.show()
col1 = df_train.value_counts('Male')
col2 = df_train.value_counts('Young')
col3 = df_train.value_counts('Attractive')
distri = {
    'gender': col1,
    'age': col2,
    'attractiveness': col3
}
distri = pd.DataFrame(distri)

## Model creation (multi-label classification)

In [15]:
def generator_wrapper(generator):
    for batch_x,batch_y in generator:
        yield (batch_x,[batch_y[:,i] for i in range(3)])

# VGG 16 as base
base_model = vgg16.VGG16(weights='imagenet', include_top=False, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))
base_model.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(512, activation="relu")(x)
gender_out = Dense(1, activation = 'sigmoid')(x)
young_out = Dense(1, activation = 'sigmoid')(x)
attractive_out = Dense(1, activation = 'sigmoid')(x)

model = Model(inputs=base_model.input, outputs=[gender_out, young_out, attractive_out])
model.compile(optimizer=Adam(learning_rate=1e-3), 
              loss=["binary_crossentropy", "binary_crossentropy", "binary_crossentropy"],
              metrics=['accuracy'])

es = EarlyStopping(monitor='val_accuracy', patience=4, restore_best_weights=True)
        
print(val_generator.dtype)
model.fit(
    generator_wrapper(train_generator),
    steps_per_epoch=TRAINING_SAMPLES//BATCH_SIZE,
    validation_data=generator_wrapper(val_generator),
    validation_steps=val_generator.n//val_generator.batch_size,
    callbacks=[es],
    epochs=NUM_EPOCHS,
    verbose=1
)

model.save('weights/gender_age')

float32
Epoch 1/20
 53/625 [=>............................] - ETA: 29s - loss: 1.8730 - dense_12_loss: 0.6133 - dense_13_loss: 0.6653 - dense_14_loss: 0.5944 - dense_12_accuracy: 0.6521 - dense_13_accuracy: 0.6462 - dense_14_accuracy: 0.7217

KeyboardInterrupt: 

## Prediction on test set

In [None]:
STEP_SIZE_TEST = test_generator.n//test_generator.batch_size

test_generator.reset()
pred = model.predict(
    test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1
)

In [None]:
pred_df = pd.DataFrame(pred[0], columns=['gender'])
pred_df['young'] = pred[1]
pred_df['attractive'] = pred[2]
pred_df

In [None]:
img_file = df_test._get_value(4, 'file_name')
img_file = images_folder + img_file
img_file
img = load_img(img_file)
img

## Custom test set

In [None]:
custom_folder = 'test/'

numpy_array = np.array(
    [['alvin.jpg', '1', '1', '1'],
     ['alvins_chai.jpg', '0', '1', '1'],
     ['iu.jpg', '0', '1', '1'],
     ['jean.jpg', '0', '1', '1'],
     ['jk.jpg', '1', '1', '1'],
     ['jolene.jpg', '0', '1', '1'],
     ['ky.jpg', '1', '1', '0'],
     ['pale.jpg', '0', '1', '1'],
     ['pale2.jpg', '0', '1', '1'],
     ['pstar.jpg', '0', '1', '1'],
     ['yx.jpg', '1', '1', '1']
    ]
)
df_test_2 = pd.DataFrame(numpy_array, columns=['file_name', 'Male', 'Young', 'Attractive'])
# df_test_2.reset_index(inplace=True)

test2_generator = datagen.flow_from_dataframe(
    dataframe=df_test_2, 
    directory=custom_folder, 
    x_col='file_name', 
    y_col=column, 
    class_mode='raw',  
    target_size=(IMG_HEIGHT, IMG_WIDTH), 
    shuffle=False
)

In [None]:
test2_generator.reset()

pred_custom = model.predict(
    test2_generator,
    verbose=1
)

In [None]:
pred_custom

In [None]:
fig = plt.figure(figsize=(30, 20))

for ind, val in enumerate(numpy_array):
    fig.add_subplot(1, len(numpy_array), ind+1)
    plt.imshow(load_img(custom_folder + val[0]))
    plt.axis('off')
    if pred_custom[0][ind] > 0.5:
        gender = ' Male'
    else:
        gender = ' Female'
    if pred_custom[1][ind] > 0.5:
        young = 'young'
    else:
        young = 'old'
    if pred_custom[2][ind] > 0.5:
        attractive = 'hot'
    else:
        attractive = 'ugz'
    plt.title("{}, {}, {}".format(gender, young, attractive))  