In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pickle
import warnings
warnings.filterwarnings('ignore')

import keras
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Model, load_model
from keras.layers import Dense
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler

np.random.seed(4042)

# Input data files are available in the read-only "../input/" directory
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

## Dataset preprocessing functions

In [None]:
def create_path(df, basepath):
    """
    Add a 'path' column holding the image file path of every row.

    The path is assembled from `basepath` plus the row's 'user_id',
    'face_id' and 'original_image' values. The column is written onto
    the frame that was passed in, and that same frame is returned.
    """
    def _image_path(row):
        # <basepath><user_id>/landmark_aligned_face.<face_id>.<original_image>
        return f"{basepath}{row['user_id']}/landmark_aligned_face.{row['face_id']}.{row['original_image']}"

    df['path'] = df.apply(_image_path, axis=1)
    return df

# Raw Adience age values that are mislabeled, mapped to the age-group
# string they are folded into.
_AGE_RELABEL = {
    "2": "(0, 2)",
    "3": "(4, 6)",
    "13": "(8, 12)",
    "(8, 23)": "(8, 12)",
    "22": "(15, 20)",
    "23": "(15, 20)",
    "29": "(25, 32)",
    "32": "(25, 32)",
    "34": "(25, 32)",
    "35": "(25, 32)",
    "(27, 32)": "(25, 32)",
    "36": "(38, 43)",
    "42": "(38, 43)",
    "45": "(38, 43)",
    "(38, 42)": "(38, 43)",
    "(38, 48)": "(38, 43)",
    "46": "(48, 53)",
    "55": "(48, 53)",
    "56": "(60, 100)",
    "57": "(60, 100)",
    "58": "(60, 100)",
}


def clean_age(age):
    """
    Normalize a mislabeled Adience age value to its age-group string.

    The value is converted to `str` first; values without a relabel rule
    are returned as that string unchanged.
    """
    label = str(age)
    return _AGE_RELABEL.get(label, label)
        
    
def filter_age(df):
    """
    Keep only rows whose 'age' is a recognized age group and label them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'age' column of age-group strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows with a recognized age group,
        with a fresh 0..n-1 index and two extra columns: 'valid' (always 1)
        and 'age_label' (the class id "0".."7" as a string).
    """
    age_dict = {"(0, 2)": "0",
                "(4, 6)": "1",
                "(8, 12)": "2",
                "(15, 20)": "3",
                "(25, 32)": "4",
                "(38, 43)": "5",
                "(48, 53)": "6",
                "(60, 100)": "7"}

    recognized = df['age'].isin(list(age_dict))
    # Work on an explicit copy: the caller's frame is left untouched and the
    # column assignments below never write onto a slice of it.
    kept = df.loc[recognized].copy()
    kept['valid'] = 1  # retained for callers that still expect this column
    kept['age_label'] = kept['age'].map(age_dict)
    return kept.reset_index(drop=True)

def flow_ready_age(df):
    """
    Reduce the frame to the two columns the image generator consumes:
    'path' (image file path) and 'age_label' (category), in that order.
    """
    wanted = ('path', 'age_label')
    return pd.concat([df[name] for name in wanted], axis=1)

## Cut-Out Augmentation

In [None]:
class CutOutDataGenerator(ImageDataGenerator):
    """
    ImageDataGenerator that additionally applies cutout regularization.

    Behaves like ImageDataGenerator, but every image is also passed through
    `augment_cutout`, which zeroes out randomly placed square patches.

    Parameters
    ----------
    cutout_size : int or None
        Side length, in pixels, of each square patch. None (or 0) disables
        the cutout.
    n_squares : int or None
        Number of patches cut out of each image. None (or 0) disables the
        cutout.
    **kwargs
        Forwarded to ImageDataGenerator. `preprocessing_function` must not
        be passed, because this class uses that slot for the cutout.
    """

    def __init__(self,
                 cutout_size=None,
                 n_squares=None,
                 **kwargs):
        # Set before the base initializer runs so the bound method handed to
        # it never sees a half-initialized object.
        self.cutout_size = cutout_size
        self.n_squares = n_squares

        super().__init__(
            preprocessing_function=self.augment_cutout,
            **kwargs)

    def augment_cutout(self, image):
        '''Takes an input image (h, w, channels) and returns a cutout version of it'''
        if not self.cutout_size or not self.n_squares:
            # Cutout not configured: hand the image back unchanged.
            return image

        h, w, _ = image.shape
        half = self.cutout_size // 2
        new_image = np.array(image, copy=True)  # leave the caller's array intact
        for _ in range(self.n_squares):
            # Patch centre is drawn uniformly over the image; the patch is
            # clipped at the borders, so edge patches may be smaller.
            y = np.random.randint(0, h)
            x = np.random.randint(0, w)
            y1, y2 = max(y - half, 0), min(y + half, h)
            x1, x2 = max(x - half, 0), min(x + half, w)
            new_image[y1:y2, x1:x2, :] = 0
        return new_image

## Visualize Age Distribution

In [None]:
# Pool all five folds to inspect the overall age-group distribution.
# data_folder is assigned here because this cell runs before the "Dataset"
# cell further down; keep the value identical to the one set there.
data_folder = '../input/adiencegender/AdienceGender/'

all_folds_df = pd.concat(
    [pd.read_csv(f"{data_folder}fold_{i}_data.txt", sep="\t") for i in range(5)],
    ignore_index=True)

# clean age category and drop rows with an unrecognized age
all_folds_df['age'] = all_folds_df['age'].apply(clean_age)
all_folds_df = filter_age(all_folds_df)

# Share of each age group, in percent, in chronological order.
age_groups = ['(0, 2)', '(4, 6)', '(8, 12)', '(15, 20)',
              '(25, 32)', '(38, 43)', '(48, 53)', '(60, 100)']
age_share = all_folds_df['age'].value_counts(normalize=True).reindex(age_groups) * 100

# Use a dedicated name for the Axes so matplotlib.pyplot (`plt`) stays usable.
ax = age_share.plot(kind='bar', figsize=(6, 4))
ax.set_xlabel('Age Group')
ax.set_ylabel('% of dataset')
ax.set_title('Age Group Distribution in All 5 Folds');


## Callbacks

In [None]:
import math

def step_decay(epoch):
    """
    Learning-rate schedule used by the LearningRateScheduler callback.

    Returns 0.01 for epochs below 10. From epoch 10 onward the rate is
    0.005 halved once for every 3 epochs counted from the start of
    training, i.e. 0.005 * 0.5 ** floor((epoch + 1) / 3).
    """
    if epoch < 10:
        return 0.01

    base_rate = 0.005
    halving_factor = 0.5
    epochs_per_drop = 3.0
    n_drops = math.floor((1 + epoch) / epochs_per_drop)
    return base_rate * math.pow(halving_factor, n_drops)

# Callback that sets each epoch's learning rate from step_decay(epoch);
# it is passed to fit() in the cross-validation loop.
lrate = LearningRateScheduler(step_decay)

## Dataset

In [None]:
# data_folder holds the fold_<i>_data.txt label files; img_folder is the
# base path that create_path prefixes to every image file name.
# Both must end with '/' because paths are built by plain string concatenation.

## Dataset Directory in Colab
#data_folder = '/content/gdrive/My Drive/Colab Notebooks/img_dataset/AdienceGender/'
#img_folder = '/content/gdrive/My Drive/Colab Notebooks/img_dataset/AdienceGender/aligned/'

## Dataset Directory in Kaggle
data_folder = '../input/adiencegender/AdienceGender/'
img_folder = '../input/adiencegender/AdienceGender/aligned/'

In [None]:
# based on pre-trained base
# image_size is passed as target_size to both generators in the CV loop.
image_size = (218, 178)
batch_size = 32

# Tutorial: https://towardsdatascience.com/how-to-augmentate-data-using-keras-38d84bd1c80c
# Docs: https://keras.io/api/preprocessing/image/#imagedatagenerator-class

# Training images: random rotation, width/height shifts, horizontal flips,
# rescaling by 1/255, plus one cutout square of size 40 per image.
train_datagen = CutOutDataGenerator(rotation_range = 6,
                                   width_shift_range = 0.2,
                                   height_shift_range = 0.2,
                                   rescale = 1./255.,
                                   horizontal_flip = True,
                                   cutout_size=40, n_squares=1)

# Validation images are only rescaled, never augmented.
val_datagen = ImageDataGenerator(rescale = 1./255.)

## 5-Fold Cross Validation

In [None]:
K = 5

# Fixed label order ("0".."7", as produced by filter_age) so the training and
# validation generators of every fold map each age label to the same output
# unit, even if a fold happens to lack one of the classes.
AGE_CLASSES = [str(i) for i in range(8)]

for k in range(K):

    print(f"=================== FOLD {k} ===================")
    # Release the previous fold's model before loading a fresh one.
    keras.backend.clear_session()

    # get training and test folds: fold k is held out, the rest is training data
    train_folds = [i for i in range(K) if i != k]
    train_df = pd.concat([pd.read_csv(f"{data_folder}fold_{i}_data.txt", sep="\t") for i in train_folds])
    test_df = pd.read_csv(f"{data_folder}fold_{k}_data.txt", sep="\t")

    # clean age category
    train_df['age'] = train_df['age'].apply(clean_age)
    test_df['age'] = test_df['age'].apply(clean_age)
    train_df = filter_age(train_df)
    test_df = filter_age(test_df)

    # create image_path, age_label dataframe for image generator
    train_df = flow_ready_age(create_path(train_df, img_folder))
    test_df = flow_ready_age(create_path(test_df, img_folder))

    # Image generator for dataset
    train_generator = train_datagen.flow_from_dataframe(dataframe=train_df,
                                              x_col=train_df.columns[0],
                                              y_col=train_df.columns[1],
                                              classes=AGE_CLASSES,
                                              batch_size=batch_size,
                                              seed=42,
                                              shuffle=True,
                                              class_mode="categorical",
                                              target_size=image_size,
                                              color_mode='rgb')

    # No shuffling for validation: validation_steps truncates the last partial
    # batch, so a fixed order makes every epoch score the same images.
    val_generator = val_datagen.flow_from_dataframe(dataframe=test_df,
                                              x_col=test_df.columns[0],
                                              y_col=test_df.columns[1],
                                              classes=AGE_CLASSES,
                                              batch_size=batch_size,
                                              seed=42,
                                              shuffle=False,
                                              class_mode="categorical",
                                              target_size=image_size,
                                              color_mode='rgb')

    TRAIN_STEP_SIZE = train_generator.n//train_generator.batch_size
    VAL_STEP_SIZE = val_generator.n//val_generator.batch_size

    # Pretrained model, reloaded for every fold so each fold starts from the
    # same weights
    inceptionv3_celeba = '../input/pretrained-inceptionv3-celeba/pretrained_inceptionv3_celeba.hdf5'
    pretrained_model = load_model(inceptionv3_celeba)

    # New 8-way softmax head on top of the pretrained 'dense_2' layer
    x = pretrained_model.get_layer('dense_2').output
    output = Dense(units=len(AGE_CLASSES),activation='softmax',name="output")(x)
    age_model = Model(pretrained_model.input, output)

    # Callbacks: keep the best weights of this fold by validation accuracy
    checkpoint = ModelCheckpoint(f"adience_age_model_fold_{k}_best.h5",
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

    callbacks_list= [checkpoint, lrate]

    # Training (the learning rate is driven by the lrate scheduler callback)
    age_model.compile(optimizer=SGD(),
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

    history = age_model.fit(train_generator,
                          steps_per_epoch=TRAIN_STEP_SIZE,
                          epochs=25,
                          validation_data=val_generator,
                          validation_steps=VAL_STEP_SIZE,
                          callbacks=callbacks_list)

    # saving histories
    with open(f"fold_{k}_lrdecay_age_model_best_HistoryDict", 'wb') as file_pi:
        pickle.dump(history.history, file_pi)

    hist_df = pd.DataFrame(history.history)

    # save as json too just in case
    with open(f"fold_{k}_lrdecay_age_model_best_HistoryJson.json", mode='w') as f:
        hist_df.to_json(f)

    # To reload a saved history later:
    #   with open(f"fold_{k}_lrdecay_age_model_best_HistoryDict", "rb") as fh:
    #       history_dict = pickle.load(fh)
    print()