In [1]:
# Mount Google Drive; the dataset folders and the checkpoint file used below live under it
from google.colab import drive

drive.mount(mountpoint='/content/gdrive/')

Mounted at /content/gdrive/


### Import Libraries

In [3]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

import cv2
import keras
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Sequential, Model, load_model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
from keras_preprocessing.image import ImageDataGenerator, img_to_array, load_img
from keras.utils import np_utils

plt.style.use('ggplot')

### Dataset

The dataset authors recommend splitting the images into training, validation and test sets as follows:

- 1-162770 are training
- 162771-182637 are validation
- 182638-202599 are testing

The partition assigned to each image is listed in the file `list_eval_partition.csv`.

Due to time constraints, only a subset of the dataset is used:

- Training: 20,000 images
- Validation: 2,000 images
- Test: 2,000 images

In [None]:
# --- Google Drive locations (CSV annotations / aligned face images) ---
img_folder = '/content/gdrive/My Drive/Colab Notebooks/img_align_celeba/img_align_celeba/'
data_folder = '/content/gdrive/My Drive/Colab Notebooks/celeba-dataset/'

# --- Input image geometry, in pixels ---
IMG_HEIGHT, IMG_WIDTH = 218, 178

# --- Number of images drawn from each partition ---
TRAINING_SAMPLES, VALIDATION_SAMPLES, TEST_SAMPLES = 20000, 2000, 2000

# --- Training hyper-parameters ---
NUM_EPOCHS = 20
BATCH_SIZE = 16

In [None]:
# Train/validation/test assignment recommended by the dataset authors
df_partition = pd.read_csv(data_folder + "list_eval_partition.csv")
df_partition.head()

In [None]:
# Number of images per partition id (the cells below use 0 = train, 1 = validation, 2 = test)
(
    df_partition['partition']
    .value_counts()
    .sort_index()
)

In [None]:
# Per-image attribute annotations. They are loaded here because no earlier cell defines
# `df_attr`. NOTE(review): assumes the standard Kaggle CelebA layout, i.e. that
# `list_attr_celeba.csv` sits next to `list_eval_partition.csv` -- confirm the path.
df_attr = (
    pd.read_csv(f"{data_folder}list_attr_celeba.csv")
    .set_index('image_id')
    # presumably absent attributes are encoded as -1 (TODO confirm); generate_df
    # filters on 0 / 1, so map -1 -> 0
    .replace(to_replace=-1, value=0)
)

# Attach the 'Male' label to each image's partition id. set_index() returns a new frame
# (no inplace=True), so re-running this cell does not fail on a missing 'image_id' column.
df_par_attr = df_partition.set_index('image_id').join(df_attr['Male'], how='inner')
df_par_attr.head()

### Generate Partitions

In [None]:
def load_reshape_img(fname):
    """Read the image at `fname` and return it as a single-image batch.

    The pixel array is divided by 255 and gains a leading axis of length 1.
    """
    pixels = img_to_array(load_img(fname))
    scaled = pixels / 255.
    return scaled[np.newaxis, ...]

def generate_df(partition, attr, num_samples, random_state=None):
    """Draw a class-balanced sample from one CelebA partition and load its images.

    Rows are taken from the notebook-level ``df_par_attr`` frame and image files
    are read from ``img_folder``. Half of the rows have ``attr == 0`` and half
    have ``attr == 1``.

    Parameters
    ----------
    partition : int
        Partition id to sample from (this notebook calls it with 0 for
        training, 1 for validation and 2 for test).
    attr : str
        Name of the binary attribute column in ``df_par_attr`` (e.g. ``'Male'``).
    num_samples : int
        Total number of images; ``int(num_samples / 2)`` rows are drawn per class.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    (x_, y_)
        For any partition other than 2: an array of shape
        ``(n, IMG_HEIGHT, IMG_WIDTH, 3)`` with pixels divided by 255, and the
        one-hot labels produced by ``to_categorical(..., 2)``.
        For partition 2: a list of ``(1, IMG_HEIGHT, IMG_WIDTH, 3)`` float32
        arrays with pixels divided by 255, and a list of integer labels.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read one of the test images.
    """
    per_class = int(num_samples / 2)
    in_partition = df_par_attr[df_par_attr['partition'] == partition]

    # class 0 rows first, then class 1 rows
    df_ = pd.concat([
        in_partition[in_partition[attr] == label].sample(per_class, random_state=random_state)
        for label in (0, 1)
    ])

    # train - validation
    if partition != 2:
        batches = [load_reshape_img(img_folder + fname) for fname in df_.index]
        # collapse the per-image leading axis: (n, 1, H, W, 3) -> (n, H, W, 3)
        x_ = np.array(batches).reshape(len(batches), IMG_HEIGHT, IMG_WIDTH, 3)
        y_ = np_utils.to_categorical(df_[attr], 2)

    # test
    else:
        x_ = []
        y_ = []

        for fname, label in zip(df_.index, df_[attr]):
            path = img_folder + fname
            im = cv2.imread(path)
            if im is None:
                # cv2.imread signals failure by returning None instead of raising
                raise FileNotFoundError(f"could not read image: {path}")
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            # cv2.resize takes the target size as (width, height)
            im = cv2.resize(im, (IMG_WIDTH, IMG_HEIGHT)).astype(np.float32) / 255.0
            x_.append(np.expand_dims(im, axis=0))
            y_.append(label)

    return x_, y_

### Data Augmentation as part of pre-training

In [None]:
# Augmentation settings previewed below. No preprocessing_function is used here,
# so the [0, 1]-scaled pixels can be passed straight to imshow.
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# Example picture for the preview; no other cell defines EXAMPLE_PIC.
# NOTE(review): assumes '000001.jpg' exists in img_folder -- any image id can be used.
EXAMPLE_PIC = img_folder + '000001.jpg'

# load one image as a single-image batch scaled to [0, 1]
example_batch = load_reshape_img(EXAMPLE_PIC)

# plot 10 augmented versions of the loaded image on a 2 x 5 grid
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
fig.suptitle('Data Augmentation', fontsize=28)

# datagen.flow() keeps yielding batches, so pairing it with the 10 axes bounds the loop
for ax, batch in zip(axes.ravel(), datagen.flow(example_batch, batch_size=1)):
    ax.grid(False)
    ax.imshow(batch[0])  # drop the batch axis instead of reshaping to hard-coded dimensions

plt.show()

In [None]:
# train data: balanced sample of partition 0; generate_df returns pixels already divided by 255
x_train, y_train = generate_df(0, 'Male', TRAINING_SAMPLES)

# Augmentation only. preprocess_input is intentionally NOT applied here: the pixels are
# already scaled to [0, 1], and the validation / test arrays are fed to the model with
# that same scaling and no preprocess_input. Applying it to the training stream alone
# would make training inputs differ from the inputs used for validation and testing.
train_datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# train_datagen.fit(x_train) is omitted: it only computes statistics for the
# featurewise_* / zca_whitening options, none of which is enabled above.
train_generator = train_datagen.flow(x_train, y_train, batch_size=BATCH_SIZE)

In [None]:
# Balanced validation subset for the 'Male' attribute, taken from partition 1
x_valid, y_valid = generate_df(partition=1, attr='Male', num_samples=VALIDATION_SAMPLES)

### Network Initialization

In [None]:
# InceptionV3 convolutional base without its classifier head, initialised from a local
# weights file (presumably the ImageNet "notop" weights, judging by the file name)
inceptionv3 = InceptionV3(weights='/content/gdrive/My Drive/Colab Notebooks/pretrained_inceptionv3_celeba/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5',
                          include_top=False,
                          input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))

# the base model is bound to `inceptionv3`; the previous `inc_model` name was never defined
print("number of layers:", len(inceptionv3.layers))

# Replacement for top layer classifier
x = inceptionv3.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(512, activation="relu")(x)
predictions = Dense(2, activation="softmax")(x)  # two classes, matching the one-hot labels

# Network for pretraining
model = Model(inputs=inceptionv3.input, outputs=predictions)

# Freezing low-level layers (general features); only layers from this index on are trained
NUM_FROZEN_LAYERS = 52
for layer in model.layers[:NUM_FROZEN_LAYERS]:
    layer.trainable = False

# `learning_rate` replaces the deprecated `lr` keyword
model.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.9),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

### Callbacks

In [None]:
# Save the model to Drive each time the validation accuracy reaches a new maximum
checkpoint = ModelCheckpoint(
    filepath='/content/gdrive/My Drive/Colab Notebooks/pretrained_inceptionv3_celeba.hdf5',
    mode='max',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

callbacks_list = [checkpoint]

### Pretraining using CelebA dataset

In [None]:
# Fit on the augmented training stream; validation uses the fixed, un-augmented arrays.
# steps_per_epoch must be an integer. TRAINING_SAMPLES / BATCH_SIZE is a float under
# true division, so the generator's own batch count is used instead.
history = model.fit(train_generator,
                    validation_data=(x_valid, y_valid),
                    steps_per_epoch=len(train_generator),
                    epochs=NUM_EPOCHS,
                    callbacks=callbacks_list,
                    verbose=1)

### Accuracy & Loss Plots

In [None]:
# Training and validation loss, one point per epoch
fig, ax = plt.subplots(figsize=(12, 6))
for key, label in (('loss', 'train'), ('val_loss', 'valid')):
    ax.plot(history.history[key], label=label)
ax.set_title('Loss')
ax.legend()
plt.show();

In [None]:
# Training and validation accuracy, one point per epoch
fig, ax = plt.subplots(figsize=(12, 6))
for key, label in (('accuracy', 'train'), ('val_accuracy', 'valid')):
    ax.plot(history.history[key], label=label)
ax.set_title('Accuracy')
ax.legend()
plt.show();

### Evaluation

In [None]:
# Restore the weights from the checkpoint file written by the ModelCheckpoint callback
model.load_weights(filepath='/content/gdrive/My Drive/Colab Notebooks/pretrained_inceptionv3_celeba.hdf5')

In [None]:
# test data: a list of single-image batches and a list of integer labels
x_test, y_test = generate_df(2, 'Male', TEST_SAMPLES)

# prediction: stack the single-image batches and run one batched predict call
# instead of calling model.predict() once per image
test_probabilities = model.predict(np.concatenate(x_test, axis=0), batch_size=BATCH_SIZE)
model_predictions = np.argmax(test_probabilities, axis=1)

# report test accuracy (in percent) and F1 score
test_accuracy = 100 * np.mean(model_predictions == np.asarray(y_test))
print(f"Test accuracy: {test_accuracy:.4f}")
print('f1_score:', f1_score(y_test, model_predictions))