This notebook runs on Amazon SageMaker.

In [None]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, auc, classification_report, 
                             confusion_matrix, f1_score, make_scorer, 
                             precision_score, precision_recall_fscore_support, 
                             recall_score, roc_auc_score, roc_curve)
from sklearn.impute import SimpleImputer

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
from tensorflow import keras
from tensorflow.keras.applications import ResNet50, VGG16
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import (Concatenate, Dense, Dropout, 
                                     Flatten, GlobalAveragePooling2D, 
                                     GlobalMaxPooling2D, Input)
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2


# Widen the pandas display limits so large frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option("display.max_columns", 500)
# Show floats with six fixed decimals.
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# Set environment variables for reproducibility
# NOTE(review): both variables are assigned after `import tensorflow` and after the
# interpreter has started. PYTHONHASHSEED normally has to be exported before Python
# launches to influence hashing -- confirm these settings actually take effect here.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['PYTHONHASHSEED'] = '0'

# Set seeds for reproducibility
# Seed each random number generator used in the notebook: NumPy, TensorFlow, stdlib.
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)


In [None]:
!nvidia-smi


## Data Ingest

In [None]:
import pickle
# Load the train/test feature bundles.
# NOTE: despite the ".json" extension these files are read as pickles. Unpickling can
# execute arbitrary code, so only load files that come from a trusted source.
# Context managers make sure the file handles are closed after reading.
with open('full_dataset_train.json', 'rb') as train_file:
    dataset_train = pickle.load(train_file)
with open('full_dataset_test.json', 'rb') as test_file:
    dataset_test = pickle.load(test_file)

In [None]:
# List the entries available in each bundle before unpacking them.
print(dataset_train.keys())
print(dataset_test.keys())

In [None]:
# Unpack the training bundle into one variable per modality.
(train_frame_features,
 train_tag_features,
 train_audio_features,
 train_labels,
 train_creative_code,
 tag_column_name) = (
    dataset_train[key]
    for key in ('train_frame_features', 'train_tag_features', 'train_audio_features',
                'train_labels', 'train_creative_code', 'tag_feature_name')
)

# Same layout for the test bundle (it carries no tag-name list of its own).
(test_frame_features,
 test_tag_features,
 test_audio_features,
 test_labels,
 test_creative_code) = (
    dataset_test[key]
    for key in ('test_frame_features', 'test_tag_features', 'test_audio_features',
                'test_labels', 'test_creative_code')
)

# Report the training shapes as a quick sanity check.
print("Shapes:")
for caption, features in (("train_frame_features:", train_frame_features),
                          ("train_tag_features:", train_tag_features),
                          ("train_audio_features:", train_audio_features),
                          ("train_labels:", train_labels)):
    print(caption, features.shape)


In [None]:
# Number of tag feature columns; sizes the tag input of the models that consume all tags.
TAG_N = train_tag_features.shape[1]

In [None]:
# Report how many GPUs TensorFlow can see (stable API instead of the `experimental` alias).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
# Baseline subset: four tag columns selected by name.
baseline_features = ['feature_1', 'feature_2', 'feature_3', 'feature_4']

# Resolve every name to its column position in the tag matrix.
indices = list(map(tag_column_name.index, baseline_features))

print(indices)

# Slice those columns out of the train and test tag matrices.
train_tag_base_features, test_tag_base_features = (
    tf.gather(tag_matrix, indices, axis=1)
    for tag_matrix in (train_tag_features, test_tag_features)
)


In [None]:
# Check the baseline tag columns for missing values: training split first, then test split.
for tag_subset in (train_tag_base_features, test_tag_base_features):
    nan_mask = tf.math.is_nan(tag_subset)
    nan_exists = tf.reduce_any(nan_mask)
    print("\nDo any NaN values exist in the selected columns?")
    print(nan_exists.numpy())

## Define Functions

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import backend as K


# Balanced class weights derived from the training labels.
_train_label_values = train_labels.numpy().reshape(-1)
_label_classes = np.unique(_train_label_values)
class_weights = compute_class_weight('balanced', classes=_label_classes, y=_train_label_values)
# Key each weight by its actual class label rather than by its position in the array, so
# the mapping stays correct even if the labels are not exactly 0..k-1.
class_weights_dict = {int(label): weight for label, weight in zip(_label_classes, class_weights)}

def evaluate_default(predictions, test_labels=test_labels, threshold=0.5):
    """Print ROC AUC, a classification report and a confusion matrix.

    Args:
        predictions: Predicted positive-class scores. Any array-like that flattens
            to one score per sample, e.g. ``(n, 1)`` from ``model.predict`` or ``(n,)``.
        test_labels: Ground-truth binary labels with one value per sample. Defaults
            to the notebook-level ``test_labels`` as bound when this function was defined.
        threshold: Scores strictly greater than this value are labelled class 1.
    """
    # Flatten both sides once so every metric sees the same 1-D layout.
    scores = np.asarray(predictions).reshape(-1)
    y_true = np.asarray(test_labels).reshape(-1)
    predictions_binary = (scores > threshold).astype(int)

    # AUC is threshold-independent, so it uses the raw scores.
    roc_auc = roc_auc_score(y_true, scores)
    print(f"AUC: {roc_auc}")

    # Print classification report
    print("\n"+"*"*30+"Classification Report (Threshold = {threshold}):".format(threshold=threshold))
    print(classification_report(y_true, predictions_binary))
    print(confusion_matrix(y_true, predictions_binary))

def plot_metrics_vs_threshold(y_test, predictions):
    """Plot class-1 recall/precision/F1 and macro recall against the decision threshold.

    The candidate thresholds are the ones returned by ``roc_curve``; candidates above 1
    are skipped. The threshold with the highest macro recall, subject to
    class-0 recall > 0.7 and class-1 recall > 0.3, is printed and, when one exists,
    passed to ``evaluate_default`` for a full report.

    Args:
        y_test: Ground-truth binary labels.
        predictions: Predicted positive-class scores (array supporting ``>`` and ``astype``).
    """
    _, _, thresholds = roc_curve(y_test, predictions)

    plot_ths = []
    recall_at_thresholds = []
    precision_at_thresholds = []
    f1_at_thresholds = []
    macro_recall_at_thresholds = []
    best_macro_recall = -10e100  # sentinel lower than any achievable recall
    best_macro_recall_threshold = 0

    # Calculate recall, precision, and F1 score at each threshold
    for threshold in thresholds:
        if threshold > 1:
            continue
        y_pred_threshold = (predictions > threshold).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_threshold, average=None)
        _, recall_macro, _, _ = precision_recall_fscore_support(y_test, y_pred_threshold, average='macro')
        # Index 1 selects the positive class.
        recall_at_thresholds.append(recall[1])
        precision_at_thresholds.append(precision[1])
        f1_at_thresholds.append(f1[1])
        macro_recall_at_thresholds.append(recall_macro)
        plot_ths.append(threshold)
        # Only accept a threshold that keeps both classes above their minimum recall.
        if (recall_macro > best_macro_recall) and (recall[0] > 0.7) and (recall[1] > 0.3):
            best_macro_recall = recall_macro
            best_macro_recall_threshold = threshold

    print("\n"+"*"*30+"Evaluate metrics vs. Threshold for true class + Macro Recall")

    # Plot the metrics: one panel per metric, all sharing the threshold axis.
    fig, axs = plt.subplots(1, 4, figsize=(12, 3), sharex=True)
    panels = [
        (recall_at_thresholds, 'Recall', 'blue'),
        (precision_at_thresholds, 'Precision', 'green'),
        (f1_at_thresholds, 'F1 Score', 'red'),
        (macro_recall_at_thresholds, 'Macro Recall', 'black'),
    ]
    for ax, (values, label, color) in zip(axs, panels):
        ax.plot(plot_ths, values, label=label, color=color)
        ax.set_xlabel('Threshold')
        # Apply the limits to this panel (previously always applied to the first one).
        ax.set_xlim([0, 1])
        ax.set_ylabel(label)
        ax.set_title(f'{label} vs. Threshold')
        ax.legend()
        ax.grid(True)

    # Show the plots
    plt.tight_layout()
    plt.show()

    print()

    print('best recall_macro', best_macro_recall)
    print('best_macro_recall_threshold', best_macro_recall_threshold)
    # A threshold of 0 means no candidate satisfied the recall constraints.
    if best_macro_recall_threshold > 0:
        evaluate_default(predictions, y_test, best_macro_recall_threshold)
    else:
        print('Cannot Beat Baseline.')

def evaluate_classification(y_true, y_pred_proba):
    """Collect AUC, accuracy and per-class / macro / weighted precision, recall and F1.

    Scores above 0.5 are treated as class 1. Returns a dict mapping metric name
    to value, in a fixed order suitable for building a summary table.
    """
    hard_labels = (np.array(y_pred_proba) > 0.5).astype(int)

    metrics = {
        'AUC': roc_auc_score(y_true, y_pred_proba),
        'Accuracy': accuracy_score(y_true, hard_labels),
    }

    # Per-class arrays: position 0 is class 0, position 1 is class 1.
    per_class = precision_recall_fscore_support(y_true, hard_labels, average=None)[:3]
    metric_names = ('Precision', 'Recall', 'F1-Score')
    for class_idx in (0, 1):
        for metric_name, values in zip(metric_names, per_class):
            metrics[f'{class_idx}-{metric_name}'] = values[class_idx]

    # Averaged variants, in the same Precision / Recall / F1 order.
    for prefix, average in (('Macro', 'macro'), ('Weighted', 'weighted')):
        averaged = precision_recall_fscore_support(y_true, hard_labels, average=average)[:3]
        for metric_name, value in zip(metric_names, averaged):
            metrics[f'{prefix}-{metric_name}'] = value

    return metrics


In [None]:

def focal_loss_with_class_weights(gamma=2., alpha=0.25, class_weight=None):
    """Build a focal-loss function, optionally scaled by per-class weights.

    Not used by the current models -- it did not perform well.

    Args:
        gamma: Focusing exponent applied to ``1 - p_t``.
        alpha: Weight of the positive class; negatives get ``1 - alpha``.
        class_weight: Optional weights multiplied with ``y_true`` and summed over the
            last axis to give one weight per sample.
            NOTE(review): must be broadcastable against ``y_true`` (a tensor or array,
            not a dict) -- confirm the intended format before using this loss.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning the mean focal loss.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Match dtypes so integer labels can be combined with float predictions.
        y_true = tf.cast(y_true, y_pred.dtype)
        # Clip to keep log() finite.
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        alpha_t = y_true * alpha + (1 - y_true) * (1 - alpha)
        # p_t is the probability assigned to the true class.
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        focal_loss = -alpha_t * K.pow(1 - p_t, gamma) * K.log(p_t)

        if class_weight is not None:
            # keepdims keeps the weights at (batch, 1) so they line up with the per-sample
            # loss instead of broadcasting across the batch axis.
            weights = tf.reduce_sum(class_weight * y_true, axis=-1, keepdims=True)
            focal_loss *= weights

        return K.mean(focal_loss)
    return focal_loss_fixed

def train_model(model11, model_name, trainset, trainlabels, valset, vallabels, 
                epochs, batch_size,
                if_callback, class_weights_setup):
    """Fit a compiled Keras model and plot its loss and AUC curves.

    Args:
        model11: Compiled Keras model to train (any model, despite the name).
        model_name: Sub-directory name used for the checkpoint path.
        trainset, trainlabels: Training inputs and labels.
        valset, vallabels: Validation inputs and labels.
        epochs, batch_size: Passed straight to ``fit``.
        if_callback: When truthy, save the model with the lowest ``val_loss``.
        class_weights_setup: ``class_weight`` mapping passed to ``fit``.

    Returns:
        The same model object after the final epoch.
    """
    # Timestamped path so repeated runs do not overwrite earlier checkpoints.
    timestamp = datetime.today().strftime('%Y-%m-%d_%H:%M:%S')
    checkpoint_filepath = 'model_checkpoints/{model_name}/best_model-{datetime}.h5'.format(
        model_name=model_name, datetime=timestamp)
    best_model_saver = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    )
    callbacks_setup = [best_model_saver] if if_callback else None

    history = model11.fit(
        trainset,
        trainlabels,
        epochs=epochs,
        batch_size=batch_size,
        class_weight=class_weights_setup,
        callbacks=callbacks_setup,
        validation_data=(valset, vallabels),
    )

    # One figure per tracked quantity; the 'auc' key relies on the metric being named 'auc'.
    curves = history.history
    for history_key, axis_label in (('loss', 'Loss'), ('auc', 'AUC')):
        plt.plot(curves[history_key], label=f'Training {axis_label}')
        plt.plot(curves[f'val_{history_key}'], label=f'Validation {axis_label}')
        plt.xlabel('Epoch')
        plt.ylabel(axis_label)
        plt.legend()
        plt.title(f'Training and Validation {axis_label}')
        plt.show()

    return model11

## MODEL 1 One image + output 

In [None]:

# ImageNet-pretrained ResNet50 backbone without its classification head.
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze every backbone layer so only the new dense head is trained.
for layer in resnet_base.layers:
    layer.trainable = False

input_image = Input(shape=(224, 224, 3))

base_model = resnet_base(input_image)
x = Flatten()(base_model)

# Classification head: one hidden layer with dropout, then a sigmoid unit for the binary label.
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model11 = Model(inputs=input_image, outputs=output)

# The metric is named 'auc' because train_model reads history.history['auc'] / ['val_auc'].
model11.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[AUC(name='auc', curve='ROC')]
              )
model11.summary()

# Recorded parameter counts:
# Total params: 75,100,033
# Trainable params: 51,512,321
# Non-trainable params: 23,587,712

In [None]:
model_name = 'model11'
# The single-image baseline uses the first frame of every creative. These two arrays were
# not defined anywhere earlier in the notebook, so they are derived here from the frame
# tensors (axis 1 indexes the frame).
train_frame_one_image = train_frame_features[:, 0, :, :, :]
test_frame_one_image = test_frame_features[:, 0, :, :, :]

trainset = train_frame_one_image
trainlabels = train_labels
# The test split doubles as the validation set for every model in this notebook.
valset = test_frame_one_image
vallabels = test_labels
class_weights_setup = class_weights_dict
model11 = train_model(model11, model_name, trainset, trainlabels, valset, vallabels,
                      epochs=20, batch_size=32,
                      if_callback=True, class_weights_setup=class_weights_setup)


In [None]:
# evaluate() returns [loss, auc]: the model was compiled with a single AUC metric, so the
# second value is AUC, not accuracy.
evaluation = model11.evaluate(valset, vallabels)
print(f"Test Loss: {evaluation[0]}, Test AUC: {evaluation[1]}")

model11_predictions = model11.predict(valset)

evaluate_default(model11_predictions)

plot_metrics_vs_threshold(vallabels, model11_predictions)


In [None]:
# Reset the Keras backend session before building the next model.
clear_session()


## MODEL 2: Pooling Way (Inspired by MVCNN)

### MODEL 2.1 total 6 images  

In [None]:
# L2 penalty shared by all dense layers of the head.
l2_reg = keras.regularizers.l2(0.004)

# ImageNet-pretrained ResNet50 backbone without its classification head, shared by all frames.
pre_trained_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the backbone; only the dense head is trained.
for layer in pre_trained_base.layers:
    layer.trainable = False

# One sample is a stack of 6 frames; split it into 6 separate image tensors along axis 1.
input_frames = Input(shape=(6, 224, 224, 3))
input_images = tf.unstack(input_frames, axis=1)

# Run every frame through the same backbone, then take the element-wise maximum across
# the frames so the pooled feature map has the shape of a single frame's features.
view_features = [pre_trained_base(input_image) for input_image in input_images]
view_pool = tf.reduce_max(tf.stack(view_features, axis=1), axis=1)

flatten_view = Flatten()(view_pool)

# # Fully connected layer (1)
# fc6 = Dense(4096, activation='relu', kernel_regularizer=l2_reg, name='fc6')(flatten_view)
# dropout6 = Dropout(0.6, name='dropout6')(fc6)

# fc7 = Dense(4096, activation='relu', kernel_regularizer=l2_reg, name='fc7')(dropout6)
# dropout7 = Dropout(0.6, name='dropout7')(fc7)

# Fully connected layer (2)
fc6 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc6')(flatten_view)
dropout6 = Dropout(0.6, name='dropout6')(fc6)

fc7 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc7')(dropout6)
dropout7 = Dropout(0.6, name='dropout7')(fc7)

# Single sigmoid unit for the binary label.
fc8 = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='fc8')(dropout7)

model21 = Model(inputs=input_frames, outputs=fc8)

# The metric is named 'auc' because train_model reads history.history['auc'] / ['val_auc'].
model21.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[AUC(name='auc', curve='ROC')])

model21.summary()

# resnet50 - original 4096 unit for fc6 fc7
# Total params: 451,419,009
# Trainable params: 427,831,297
# Non-trainable params: 23,587,712

# resnet50 - 512 + 256 - simpler network 
# Total params: 75,100,033
# Trainable params: 51,512,321
# Non-trainable params: 23,587,712


In [None]:
# Hardware note: ml.g4dn.4xlarge works for VGG16 (7,7,512) with batch_size=32, but not
# for ResNet50 (7,7,2048), not even with batch_size=4.
# Inputs are the full six-frame stacks; the test split is used as validation data.
trainset, trainlabels = train_frame_features, train_labels
valset, vallabels = test_frame_features, test_labels
class_weights_setup = class_weights_dict
model21 = train_model(
    model21, 'model21',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)


In [None]:
# evaluate() returns [loss, auc]: the model was compiled with a single AUC metric, so the
# second value is AUC, not accuracy.
evaluation = model21.evaluate(valset, vallabels)
print(f"Test Loss: {evaluation[0]}, Test AUC: {evaluation[1]}")
model21_predictions = model21.predict(valset)
evaluate_default(model21_predictions)
plot_metrics_vs_threshold(vallabels, model21_predictions)


### MODEL 2.2 total 6 images + audio 

In [None]:
# L2 penalty shared by all dense layers of the head.
l2_reg = keras.regularizers.l2(0.004)

# ImageNet-pretrained ResNet50 backbone without its classification head, shared by all frames.
pre_trained_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the backbone; only the dense head is trained.
for layer in pre_trained_base.layers:
    layer.trainable = False

# One sample is a stack of 6 frames; split it into 6 separate image tensors along axis 1.
input_frames = Input(shape=(6, 224, 224, 3))
input_images = tf.unstack(input_frames, axis=1)

input_audio = Input(shape=(1024,), name='audio_input')  # Assuming YAMNet outputs 1024-dimensional embeddings

# Shared backbone per frame, then element-wise maximum across the frames.
view_features = [pre_trained_base(input_image) for input_image in input_images]
view_pool = tf.reduce_max(tf.stack(view_features, axis=1), axis=1)

flatten_view = Flatten()(view_pool)

# Append the raw audio embedding to the pooled image features.
combined_features = Concatenate()([flatten_view, input_audio])

fc6 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc6')(combined_features)
dropout6 = Dropout(0.6, name='dropout6')(fc6)

fc7 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc7')(dropout6)
dropout7 = Dropout(0.6, name='dropout7')(fc7)

# Single sigmoid unit for the binary label.
fc8 = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='fc8')(dropout7)

# Input order [frames, audio] must match the list passed to fit/predict.
model22 = Model(inputs=[input_frames, input_audio], outputs=fc8)

model22.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[AUC(name='auc', curve='ROC')])

model22.summary()


In [None]:

# Inputs in the order the model declares them: [frames, audio]. Test split = validation data.
trainset, trainlabels = [train_frame_features, train_audio_features], train_labels
valset, vallabels = [test_frame_features, test_audio_features], test_labels
class_weights_setup = class_weights_dict
model22 = train_model(
    model22, 'model22',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)



In [None]:

# evaluate() returns [loss, auc]: the model was compiled with a single AUC metric, so the
# second value is AUC, not accuracy.
evaluation = model22.evaluate(valset, vallabels)
print(f"Test Loss: {evaluation[0]}, Test AUC: {evaluation[1]}")

model22_predictions = model22.predict(valset)

evaluate_default(model22_predictions)

plot_metrics_vs_threshold(vallabels, model22_predictions)


### MODEL 2.3 total 6 images + audio + TAG

In [None]:

# L2 penalty shared by all dense layers of the head.
l2_reg = keras.regularizers.l2(0.004)

# ImageNet-pretrained ResNet50 backbone without its classification head, shared by all frames.
pre_trained_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the backbone; only the dense head is trained.
for layer in pre_trained_base.layers:
    layer.trainable = False

# One sample is a stack of 6 frames; split it into 6 separate image tensors along axis 1.
input_frames = Input(shape=(6, 224, 224, 3))
input_images = tf.unstack(input_frames, axis=1)

input_audio = Input(shape=(1024,), name='audio_input')  # Assuming YAMNet outputs 1024-dimensional embeddings
# All TAG_N tag columns are fed in as-is.
input_tag = Input(shape=(TAG_N,), name='tag_input')  

# Shared backbone per frame.
view_features = [pre_trained_base(input_image) for input_image in input_images]

# Element-wise maximum across the frames.
view_pool = tf.reduce_max(tf.stack(view_features, axis=1), axis=1)

flatten_view = Flatten()(view_pool)


# Pooled image features + audio embedding + tag features.
combined_features = Concatenate()([flatten_view, input_audio, input_tag])

# Fully connected layer (2)
fc6 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc6')(combined_features)
dropout6 = Dropout(0.6, name='dropout6')(fc6)

fc7 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc7')(dropout6)
dropout7 = Dropout(0.6, name='dropout7')(fc7)

# Single sigmoid unit for the binary label.
fc8 = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='fc8')(dropout7)

# Input order [frames, audio, tags] must match the list passed to fit/predict.
model23 = Model(inputs=[input_frames, input_audio, input_tag], outputs=fc8)


model23.compile(optimizer='adam', 
                loss='binary_crossentropy', 
              metrics=[AUC(name='auc', curve='ROC')])

model23.summary()


In [None]:
# Fill missing tag values with per-column medians learned from the training split only,
# then apply the same medians to the test split.
imputer = SimpleImputer(strategy='median').fit(train_tag_features)
train_tag_features_imputed = imputer.transform(train_tag_features)
test_tag_features_imputed = imputer.transform(test_tag_features)

# Inputs in the order the model declares them: [frames, audio, tags].
trainset, trainlabels = (
    [train_frame_features, train_audio_features, train_tag_features_imputed],
    train_labels,
)
valset, vallabels = (
    [test_frame_features, test_audio_features, test_tag_features_imputed],
    test_labels,
)
class_weights_setup = class_weights_dict
model23 = train_model(
    model23, 'model23',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)



In [None]:

# evaluate() returns [loss, auc]: the model was compiled with a single AUC metric, so the
# second value is AUC, not accuracy.
evaluation = model23.evaluate(valset, vallabels)
print(f"Test Loss: {evaluation[0]}, Test AUC: {evaluation[1]}")

model23_predictions = model23.predict(valset)

evaluate_default(model23_predictions)

plot_metrics_vs_threshold(vallabels, model23_predictions)


### MODEL 2.4 6 images + audio + BASE TAG

In [None]:

# L2 penalty shared by all dense layers of the head.
l2_reg = keras.regularizers.l2(0.004)

# ImageNet-pretrained ResNet50 backbone without its classification head, shared by all frames.
pre_trained_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the backbone; only the dense head is trained.
for layer in pre_trained_base.layers:
    layer.trainable = False

# One sample is a stack of 6 frames; split it into 6 separate image tensors along axis 1.
input_frames = Input(shape=(6, 224, 224, 3))
input_images = tf.unstack(input_frames, axis=1)

input_audio = Input(shape=(1024,), name='audio_input')  # Assuming YAMNet outputs 1024-dimensional embeddings
# Width 4 matches the four columns listed in `baseline_features`.
input_tag = Input(shape=(4,), name='tag_input')  

view_features = [pre_trained_base(input_image) for input_image in input_images]
# Perform element-wise maximum to combine the view features
view_pool = tf.reduce_max(tf.stack(view_features, axis=1), axis=1)

flatten_view = Flatten()(view_pool)

# Pooled image features + audio embedding + baseline tag features.
combined_features = Concatenate()([flatten_view, input_audio, input_tag])

# Fully connected layer (2)
fc6 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc6')(combined_features)
dropout6 = Dropout(0.6, name='dropout6')(fc6)

fc7 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc7')(dropout6)
dropout7 = Dropout(0.6, name='dropout7')(fc7)

# Single sigmoid unit for the binary label.
fc8 = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='fc8')(dropout7)

# Input order [frames, audio, tags] must match the list passed to fit/predict.
model232 = Model(inputs=[input_frames, input_audio, input_tag], outputs=fc8)

model232.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[AUC(name='auc', curve='ROC')])

model232.summary()


In [None]:

# Inputs in the order the model declares them: [frames, audio, baseline tags].
trainset, trainlabels = (
    [train_frame_features, train_audio_features, train_tag_base_features],
    train_labels,
)
valset, vallabels = (
    [test_frame_features, test_audio_features, test_tag_base_features],
    test_labels,
)
class_weights_setup = class_weights_dict
model232 = train_model(
    model232, 'model232',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)

In [None]:

# evaluate() returns [loss, auc]: the model was compiled with a single AUC metric, so the
# second value is AUC, not accuracy.
evaluation = model232.evaluate(valset, vallabels)
print(f"Test Loss: {evaluation[0]}, Test AUC: {evaluation[1]}")

model232_predictions = model232.predict(valset)

evaluate_default(model232_predictions)

plot_metrics_vs_threshold(vallabels, model232_predictions)


## MODEL 3: Augment Way 



### Preprocess Data 

In [None]:
def predict_and_average(model, frame_features, audio_features=None, tag_features=None):
    """Score each frame of every sample separately and average the per-frame predictions.

    Args:
        model: Object with a Keras-style ``predict(list_of_inputs)`` method that takes a
            single image plus the side inputs listed below.
        frame_features: Array indexed as ``[sample, frame, height, width, channel]``.
        audio_features: Optional per-sample side input, passed right after the image.
        tag_features: Optional per-sample side input, passed after ``audio_features``.
            A side input left as ``None`` is omitted, so models with fewer inputs can
            be scored with the same helper.

    Returns:
        1-D array with one averaged prediction per sample.
    """
    # Side inputs are identical for every frame, so assemble them once.
    side_inputs = [features for features in (audio_features, tag_features) if features is not None]

    predictions = []
    for i in range(frame_features.shape[1]):  # Iterate over the frames
        image = frame_features[:, i, :, :, :]
        predictions.append(model.predict([image, *side_inputs]))

    # Mean over the frame axis, then flatten (n, 1) -> (n,).
    avg_predictions = np.mean(predictions, axis=0)
    return avg_predictions.reshape(-1)



In [None]:
# Number of frames stored per creative (axis 1 of the frame tensor). Read from the data
# instead of hard-coding it, together with the feature widths used below.
n_train_frames = train_frame_features.shape[1]

# Repeat each creative code once per frame so it stays aligned with the flattened frames.
train_creative_code_expanded = [item for item in train_creative_code for _ in range(n_train_frames)]

# Flatten [creative, frame, H, W, C] into one row per individual frame.
train_frame_features_expanded = tf.reshape(train_frame_features, (-1, 224, 224, 3))

# Duplicate every per-creative feature row once per frame (consecutive copies), so row k of
# each expanded tensor belongs to the same creative as frame row k.
train_tag_features_expanded = tf.tile(train_tag_features[:, tf.newaxis, :], [1, n_train_frames, 1])
train_tag_features_expanded = tf.reshape(train_tag_features_expanded, (-1, train_tag_features.shape[-1]))
train_tag_base_features_expanded = tf.tile(train_tag_base_features[:, tf.newaxis, :], [1, n_train_frames, 1])
train_tag_base_features_expanded = tf.reshape(train_tag_base_features_expanded, (-1, train_tag_base_features.shape[-1]))

train_audio_features_expanded = tf.tile(train_audio_features[:, tf.newaxis, :], [1, n_train_frames, 1])
train_audio_features_expanded = tf.reshape(train_audio_features_expanded, (-1, train_audio_features.shape[-1]))

train_labels_expanded = tf.tile(train_labels[:, tf.newaxis, :], [1, n_train_frames, 1])
train_labels_expanded = tf.reshape(train_labels_expanded, (-1, 1))

# Check the shapes
print("Expanded train_frame_features shape:", train_frame_features_expanded.shape)
print("Expanded train_tag_features shape:", train_tag_features_expanded.shape)
print("Expanded train_tag_base_features shape:", train_tag_base_features_expanded.shape)
print("Expanded train_audio_features shape:", train_audio_features_expanded.shape)
print("Expanded train_labels shape:", train_labels_expanded.shape)
print("Expanded train_creative_code shape:", len(train_creative_code_expanded))



In [None]:
# Number of frames stored per creative (axis 1 of the frame tensor). Read from the data
# instead of hard-coding it, together with the feature widths used below.
n_test_frames = test_frame_features.shape[1]

# Step 1: flatten [creative, frame, H, W, C] into one row per individual frame.
test_frame_features_expanded = tf.reshape(test_frame_features, (-1, 224, 224, 3))

# Step 2: Duplicate the other features once per frame (consecutive copies), so row k of
# each expanded tensor belongs to the same creative as frame row k.
test_tag_features_expanded = tf.tile(test_tag_features[:, tf.newaxis, :], [1, n_test_frames, 1])
test_tag_features_expanded = tf.reshape(test_tag_features_expanded, (-1, test_tag_features.shape[-1]))

test_tag_base_features_expanded = tf.tile(test_tag_base_features[:, tf.newaxis, :], [1, n_test_frames, 1])
test_tag_base_features_expanded = tf.reshape(test_tag_base_features_expanded, (-1, test_tag_base_features.shape[-1]))

test_audio_features_expanded = tf.tile(test_audio_features[:, tf.newaxis, :], [1, n_test_frames, 1])
test_audio_features_expanded = tf.reshape(test_audio_features_expanded, (-1, test_audio_features.shape[-1]))

test_labels_expanded = tf.tile(test_labels[:, tf.newaxis, :], [1, n_test_frames, 1])
test_labels_expanded = tf.reshape(test_labels_expanded, (-1, 1))

# Repeat each creative code once per frame to stay aligned with the flattened frames.
test_creative_code_expanded = [item for item in test_creative_code for _ in range(n_test_frames)]

# Check the shapes
print("Expanded test_frame_features shape:", test_frame_features_expanded.shape)
print("Expanded test_tag_features shape:", test_tag_features_expanded.shape)
print("Expanded test_tag_base_features shape:", test_tag_base_features_expanded.shape)
print("Expanded test_audio_features shape:", test_audio_features_expanded.shape)
print("Expanded test_labels shape:", test_labels_expanded.shape)
print("Expanded test_creative_code shape:", len(test_creative_code_expanded))


### MODEL 3.1 6 images 

In [None]:

# L2 penalty shared by all dense layers of the head.
l2_reg = l2(0.004)

# Frozen ImageNet-pretrained ResNet50 backbone without its classification head.
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
for layer in resnet_base.layers:
    layer.trainable = False

input_image = Input(shape=(224, 224, 3), name='image_input')
# Audio and baseline-tag inputs are declared here so the cell runs on a fresh kernel
# (previously it depended on tensors left over from other cells). They are not connected
# to the output: this model is image-only, but it keeps the same three-input signature as
# the training list and predict_and_average use for the other "augment" models.
input_audio = Input(shape=(1024,), name='audio_input')
input_tag_base = Input(shape=(4,), name='tag_base_input')

# Global average pooling reduces the backbone feature map to one vector per image.
image_features = resnet_base(input_image)
flatten_image = GlobalAveragePooling2D()(image_features)

fc1 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc1')(flatten_image)
dropout1 = Dropout(0.6, name='dropout1')(fc1)
fc2 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc2')(dropout1)
dropout2 = Dropout(0.6, name='dropout2')(fc2)
output = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='output')(dropout2)

# Define the model
model31 = Model(inputs=[input_image, input_audio, input_tag_base], outputs=output)
# The metric is named 'auc' because train_model reads history.history['auc'] / ['val_auc'].
model31.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=[tf.keras.metrics.AUC(name='auc', curve='ROC')])

model31.summary()


In [None]:

# Per-frame training data: every frame is its own sample, paired with its creative's
# audio and baseline-tag rows. The expanded test split is used as validation data.
trainset, trainlabels = (
    [train_frame_features_expanded, train_audio_features_expanded, train_tag_base_features_expanded],
    train_labels_expanded,
)
valset, vallabels = (
    [test_frame_features_expanded, test_audio_features_expanded, test_tag_base_features_expanded],
    test_labels_expanded,
)
class_weights_setup = class_weights_dict
model31 = train_model(
    model31, 'model31',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)


In [None]:
# Score every frame separately and average back to one prediction per creative, so the
# result lines up with the un-expanded test labels.
model31_predictions = predict_and_average(model31, test_frame_features, test_audio_features, test_tag_base_features)

evaluate_default(model31_predictions)

plot_metrics_vs_threshold(test_labels, model31_predictions)



### MODEL 3.2 6 images + BASE TAG

In [None]:

# L2 penalty shared by all dense layers of the head.
l2_reg = l2(0.004)

# Frozen ImageNet-pretrained ResNet50 backbone without its classification head.
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
for layer in resnet_base.layers:
    layer.trainable = False

input_tag_base = Input(shape=(4,), name='input_tag_base_input')
# Declared locally instead of reusing a tensor left over from an earlier cell. The audio
# input is not connected to the output; it only keeps the three-input signature that the
# training list and predict_and_average use for the "augment" models.
input_audio = Input(shape=(1024,), name='audio_input')

input_image = Input(shape=(224, 224, 3), name='image_input')
# Global average pooling reduces the backbone feature map to one vector per image.
image_features = resnet_base(input_image)
flatten_image = GlobalAveragePooling2D()(image_features)

# Image vector + the four baseline tag features.
combined_features = Concatenate()([flatten_image, input_tag_base])

fc1 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc1')(combined_features)
dropout1 = Dropout(0.6, name='dropout1')(fc1)
fc2 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc2')(dropout1)
dropout2 = Dropout(0.6, name='dropout2')(fc2)
output = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='output')(dropout2)

# Define the model
model32 = Model(inputs=[input_image, input_audio, input_tag_base], outputs=output)
# The metric is named 'auc' because train_model reads history.history['auc'] / ['val_auc'].
model32.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=[tf.keras.metrics.AUC(name='auc', curve='ROC')])

model32.summary()


In [None]:

# Per-frame training data: every frame is its own sample, paired with its creative's
# audio and baseline-tag rows. The expanded test split is used as validation data.
trainset, trainlabels = (
    [train_frame_features_expanded, train_audio_features_expanded, train_tag_base_features_expanded],
    train_labels_expanded,
)
valset, vallabels = (
    [test_frame_features_expanded, test_audio_features_expanded, test_tag_base_features_expanded],
    test_labels_expanded,
)
class_weights_setup = class_weights_dict
model32 = train_model(
    model32, 'model32',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)


In [None]:
# Score every frame separately and average back to one prediction per creative, so the
# result lines up with the un-expanded test labels.
model32_predictions = predict_and_average(model32, test_frame_features, test_audio_features, test_tag_base_features)

evaluate_default(model32_predictions)

plot_metrics_vs_threshold(test_labels, model32_predictions)



### MODEL 3.3 6 images + audio + BASE TAG

In [None]:
# L2 penalty shared by all dense layers.
l2_reg = l2(0.004)

# ImageNet-pretrained ResNet50 backbone without its classification head.
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the backbone; only the dense layers are trained.
for layer in resnet_base.layers:
    layer.trainable = False

input_image = Input(shape=(224, 224, 3), name='image_input')
input_audio = Input(shape=(1024,), name='audio_input')  # Assuming YAMNet outputs 1024-dimensional embeddings
# Project the audio embedding down to 100 units before fusing it with the other inputs.
fc_audio = Dense(100, activation='relu', kernel_regularizer=l2_reg, name='fc_audio')(input_audio)
# Width 4 matches the four columns listed in `baseline_features`.
input_tag_base = Input(shape=(4,), name='tag_base_input')  

# Global average pooling reduces the backbone feature map to one vector per image,
# which is then projected to 512 units.
image_features = resnet_base(input_image)
flatten_image = GlobalAveragePooling2D()(image_features)
fc_image = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc_image')(flatten_image)

# Fuse the projected image and audio branches with the raw baseline tags.
combined_features = Concatenate()([fc_image, fc_audio, input_tag_base])

fc1 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc1')(combined_features)
dropout1 = Dropout(0.6, name='dropout1')(fc1)
fc2 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc2')(dropout1)
dropout2 = Dropout(0.6, name='dropout2')(fc2)
# Single sigmoid unit for the binary label.
output = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='output')(dropout2)

# Input order [image, audio, tags] must match the list passed to fit/predict.
model33 = Model(inputs=[input_image, input_audio, input_tag_base], outputs=output)
model33.compile(optimizer='adam', 
                loss='binary_crossentropy', 
                metrics=[tf.keras.metrics.AUC(name='auc', curve='ROC')])

model33.summary()


In [None]:

# Per-frame training data: every frame is its own sample, paired with its creative's
# audio and baseline-tag rows. The expanded test split is used as validation data.
trainset, trainlabels = (
    [train_frame_features_expanded, train_audio_features_expanded, train_tag_base_features_expanded],
    train_labels_expanded,
)
valset, vallabels = (
    [test_frame_features_expanded, test_audio_features_expanded, test_tag_base_features_expanded],
    test_labels_expanded,
)
class_weights_setup = class_weights_dict
model33 = train_model(
    model33, 'model33',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)


In [None]:

# Score every frame separately and average back to one prediction per creative, so the
# result lines up with the un-expanded test labels.
model33_predictions = predict_and_average(model33, test_frame_features, test_audio_features, test_tag_base_features)

evaluate_default(model33_predictions)

plot_metrics_vs_threshold(test_labels, model33_predictions)



### MODEL 3.4 6 images + audio + ALL TAG

In [None]:

# L2 penalty shared by all dense layers.
l2_reg = l2(0.004)

# ImageNet-pretrained ResNet50 backbone without its classification head.
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Freeze the ResNet50 layers
for layer in resnet_base.layers:
    layer.trainable = False

input_image = Input(shape=(224, 224, 3), name='image_input')

input_audio = Input(shape=(1024,), name='audio_input')  # Assuming YAMNet outputs 1024-dimensional embeddings
# Project the audio embedding down to 100 units before fusing it with the other inputs.
fc_audio = Dense(100, activation='relu', kernel_regularizer=l2_reg, name='fc_audio')(input_audio)


# All TAG_N tag columns are fed in as-is.
input_tag = Input(shape=(TAG_N,), name='input_tag')  

# Global average pooling reduces the backbone feature map to one vector per image,
# which is then projected to 512 units.
image_features = resnet_base(input_image)
flatten_image = GlobalAveragePooling2D()(image_features)
fc_image = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc_image')(flatten_image)


# Fuse the projected image and audio branches with the raw tag features.
combined_features = Concatenate()([fc_image, fc_audio, input_tag])

fc1 = Dense(512, activation='relu', kernel_regularizer=l2_reg, name='fc1')(combined_features)
dropout1 = Dropout(0.6, name='dropout1')(fc1)
fc2 = Dense(256, activation='relu', kernel_regularizer=l2_reg, name='fc2')(dropout1)
dropout2 = Dropout(0.6, name='dropout2')(fc2)
# Single sigmoid unit for the binary label.
output = Dense(1, activation='sigmoid', kernel_regularizer=l2_reg, name='output')(dropout2)

# Input order [image, audio, tags] must match the list passed to fit/predict.
model34 = Model(inputs=[input_image, input_audio, input_tag], outputs=output)
model34.compile(optimizer='adam', 
                loss='binary_crossentropy', 
                metrics=[tf.keras.metrics.AUC(name='auc', curve='ROC')])

model34.summary()


In [None]:
# Fill missing tag values with per-column medians learned from the (expanded) training
# split, then apply the same medians to the expanded test split.
imputer = SimpleImputer(strategy='median').fit(train_tag_features_expanded)
train_tag_features_imputed = imputer.transform(train_tag_features_expanded)
test_tag_features_imputed = imputer.transform(test_tag_features_expanded)

# Per-frame training data in the order the model declares its inputs: [image, audio, tags].
trainset, trainlabels = (
    [train_frame_features_expanded, train_audio_features_expanded, train_tag_features_imputed],
    train_labels_expanded,
)
valset, vallabels = (
    [test_frame_features_expanded, test_audio_features_expanded, test_tag_features_imputed],
    test_labels_expanded,
)
class_weights_setup = class_weights_dict
model34 = train_model(
    model34, 'model34',
    trainset, trainlabels,
    valset, vallabels,
    epochs=20,
    batch_size=32,
    if_callback=True,
    class_weights_setup=class_weights_setup,
)


In [None]:

# Re-impute the un-expanded test tags (one row per creative): predict_and_average iterates
# over the frames itself and reuses the same tag rows for each of them.
# Note that this rebinds `test_tag_features_imputed`, which previously held the expanded rows.
test_tag_features_imputed = imputer.transform(test_tag_features)


# Average the per-frame scores back to one prediction per creative.
model34_predictions = predict_and_average(model34, test_frame_features, test_audio_features, test_tag_features_imputed)

evaluate_default(model34_predictions)

plot_metrics_vs_threshold(test_labels, model34_predictions)



## Summary


In [None]:
# Evaluate models
# Score each model's per-creative predictions against the test labels.
(results_11, results_21, results_22, results_23, results_232,
 results_31, results_32, results_33, results_34) = (
    evaluate_classification(test_labels, model_predictions)
    for model_predictions in (
        model11_predictions, model21_predictions, model22_predictions,
        model23_predictions, model232_predictions, model31_predictions,
        model32_predictions, model33_predictions, model34_predictions,
    )
)

# Build the comparison table and transpose it: one row per model, one column per metric.
results_df = pd.DataFrame({
    '11: 1st Image (DNN Baseline)': results_11,
    '21: 6 Images (Pooling)': results_21,
    '22: 6 Images (Pooling) + Audio(1024d)': results_22,
    '23: 6 Images (Pooling) + Audio(1024d) + TAG(219d)': results_23,
    '232: 6 Images (Pooling) + Audio(1024d) + TAG(4d)': results_232,
    '31: Augment: 1 image': results_31,
    '32: Augment: 1 image TAG(4d)': results_32,
    '33: Augment: 1 image + Audio(1024d) + TAG(4d)': results_33,
    '34: Augment: 1 image + Audio(1024d) + TAG(219d)': results_34,
}).T



In [None]:
def format_values(val):
    """Render a fractional value as a whole-number percentage string.

    Args:
        val: numeric value on a 0-1 scale (e.g. ``0.5``).

    Returns:
        str: ``val`` multiplied by 100, formatted with no decimals and a
        trailing ``%`` (e.g. ``'50%'``).
    """
    percent = 100 * val
    return "{:.0f}%".format(percent)

# Build the display table step by step: percentage text, green shading by
# value, and a fixed cell width.
results_styler = results_df.style.format(format_values)
results_styler = results_styler.background_gradient(cmap='Greens')
styled_results_df = results_styler.set_properties(**{'width': '120px'})

# Last expression of the cell, so the notebook renders the styled table.
styled_results_df


## Best Model - Explainability

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are in the
# first cell.
from tensorflow.keras.models import load_model

# Reload a saved checkpoint from disk instead of using an in-memory model.
final_model = load_model('model_checkpoints/final_model-2024-08-15_14:39:54.h5')

# NOTE(review): this overwrites the model32_predictions value used for the
# summary table with final_model's predictions. Later cells that read
# model32_predictions therefore see the reloaded model's scores.
model32_predictions = predict_and_average(final_model, test_frame_features, test_tag_base_features)

evaluate_default(model32_predictions, test_labels)
plot_metrics_vs_threshold(test_labels, model32_predictions)



In [None]:

# Score each frame position on its own (paired with the same tag features) so
# the prediction distributions can be compared across frame positions.
n_images = test_frame_features.shape[1]
final_predictions = []
for i in range(n_images):
    # Named `frame_batch` rather than `image` so the keras `image` module
    # imported at the top of the notebook is not shadowed.
    frame_batch = test_frame_features[:, i, :, :, :]
    tags = test_tag_base_features
    pred = final_model.predict([frame_batch, tags])
    final_predictions.append(pred)

# One column per frame position, one row per test sample. Sizes are derived
# from the data instead of being hard-coded.
image_names = [str(i + 1) for i in range(n_images)]
per_image_scores = np.array(final_predictions).reshape(n_images, -1).T
df = pd.DataFrame(per_image_scores, columns=image_names)

# --- Plot 1: prediction distribution per frame position ---------------------
medians = df.median()
df_melted = df.melt(var_name='Set', value_name='Values')
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Set', y='Values', data=df_melted, ax=ax)
# Annotate each box with its median, slightly above the median line.
for i, set_name in enumerate(image_names):
    median_val = medians[set_name]
    ax.text(i, median_val + 0.02, f'{median_val:.2f}', ha='center', va='bottom', fontsize=10, color='black')
ax.set_title('Box Plot of Predictions For Each Image')
ax.set_xlabel('Images')
ax.set_ylabel('Prediction')
plt.show()

# --- Plot 2: same distributions, split by the true label --------------------
df['target'] = test_labels
df_melted = df.melt(var_name='Set', id_vars=['target'], value_name='Values')
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Set', y='Values', data=df_melted, hue='target', ax=ax)
ax.set_title('Box Plot of Predictions For Each Image By Real Target')
ax.set_xlabel('Images')
ax.set_ylabel('Prediction')
plt.show()


In [None]:
# Creative codes of the test rows, as an array so index arrays can select from it.
creative_codes = np.array(test_creative_code)

# Rows where model32_predictions is at least 0.5.
predicted_positive_rows = np.nonzero((model32_predictions >= 0.5) == 1)[0]
pred_pos_creatives = set(creative_codes[predicted_positive_rows])

# Rows whose label equals 1.
actual_positive_rows = np.nonzero(test_labels == 1)[0]
real_pos_creatives = set(creative_codes[actual_positive_rows])

# Creatives that are both labelled positive and predicted positive.
true_positive_samples = real_pos_creatives & pred_pos_creatives
print(true_positive_samples)

In [None]:
# Row positions of the test samples whose label equals 1.
indices = np.nonzero(test_labels == 1)[0]
# Show the creative codes belonging to those rows.
positive_creative_codes = np.array(test_creative_code)[indices]
print(positive_creative_codes)

In [None]:

def get_gradcam_heatmap(model, img_array, tag_array, last_conv_layer_name, pred_index=None):
    """Compute a Grad-CAM heatmap for the first sample of an image/tag batch.

    Args:
        model: Keras model that is called with ``[img_array, tag_array]`` and
            contains a layer named ``last_conv_layer_name``.
        img_array: batched image input for ``model``.
        tag_array: batched tag input for ``model``.
        last_conv_layer_name: name of the layer whose output is visualised.
            The code assumes that output is (batch, height, width, channels)
            -- TODO confirm for other backbones.
        pred_index: index of the model output to explain. Defaults to the
            arg-max of the first sample's prediction.

    Returns:
        numpy.ndarray: non-negative heatmap scaled so its maximum is 1, or
        all zeros when no location has a positive activation.
    """
    with tf.device('/CPU:0'):
        # `model.inputs` is already a list; wrapping it in another list makes
        # a nested input structure that does not match the flat
        # `[img_array, tag_array]` call below.
        grad_model = Model(
            model.inputs,
            [model.get_layer(last_conv_layer_name).output, model.output]
        )

        with tf.GradientTape() as tape:
            conv_outputs, predictions = grad_model([img_array, tag_array])
            if pred_index is None:
                pred_index = tf.argmax(predictions[0])
            class_channel = predictions[:, pred_index]

        # Gradient of the explained output w.r.t. the layer's feature maps.
        grads = tape.gradient(class_channel, conv_outputs)

        # One weight per channel: the gradient averaged over batch and space.
        pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

        # Channel-weighted sum of the first sample's feature maps.
        heatmap = conv_outputs[0] @ pooled_grads[..., tf.newaxis]
        heatmap = tf.squeeze(heatmap)

        # Keep positive evidence only, then scale to [0, 1]. The maximum is
        # taken after the ReLU and the division is NaN-safe, so an all
        # non-positive map yields zeros instead of NaN.
        heatmap = tf.maximum(heatmap, 0)
        heatmap = tf.math.divide_no_nan(heatmap, tf.math.reduce_max(heatmap))
        return heatmap.numpy()
    
def reverse_preprocess(img):
    """Undo caffe-style ImageNet preprocessing so an image can be displayed.

    The forward transform (what the ResNet50 ``preprocess_input`` imported at
    the top of this notebook is documented to do) flips RGB to BGR and then
    subtracts a per-channel mean in BGR order. The inverse must therefore add
    the means while the channels are still BGR, and only then flip to RGB.

    Args:
        img: channels-last image data in preprocessed (mean-subtracted BGR)
            form.

    Returns:
        numpy.ndarray: RGB pixel values clipped to ``[0, 255]`` (floating
        point; the caller casts to an integer type if needed).
    """
    # ImageNet channel means in B, G, R order.
    bgr_mean = np.array([103.939, 116.779, 123.68])
    # Restore the means first: the last axis is still BGR at this point.
    img = img + bgr_mean
    # Now convert BGR back to RGB.
    img = img[..., ::-1]
    return np.clip(img, 0, 255)

def display_gradcam(img_array, heatmap, info, alpha=0.3):
    """Plot an image, its Grad-CAM heatmap and their overlay, and save the figure.

    Args:
        img_array: batch of preprocessed images; only ``img_array[0]`` is shown.
        heatmap: heatmap for that image (assumed 2-D, e.g. the output of
            ``get_gradcam_heatmap`` -- TODO confirm for other callers).
        info: dict with keys ``'code'`` (creative identifier), ``'no_img'``
            (image number) and ``'pred_score'`` (model score shown in the title).
        alpha: weight of the coloured heatmap when added onto the image.

    Side effects:
        Writes ``images_output/{code}_{no_img}.png`` (creating the directory
        if needed), shows the figure, then closes it.
    """
    # Back to displayable 8-bit RGB.
    img = reverse_preprocess(img_array[0]).astype('uint8')

    # Upsample the heatmap to the image resolution and rescale it to [0, 1].
    # The division is NaN-safe so an all-zero heatmap stays all zero.
    heatmap_resized = tf.image.resize(heatmap[..., np.newaxis], (img.shape[0], img.shape[1]))
    heatmap_resized = tf.squeeze(heatmap_resized, axis=-1)
    heatmap2d_normalized = tf.math.divide_no_nan(heatmap_resized, tf.reduce_max(heatmap_resized))

    # Colour-map the heatmap (viridis) and drop the alpha channel.
    colored_heatmap = plt.cm.viridis(heatmap2d_normalized.numpy())[:, :, :3]
    img = img.astype('float32') / 255.0

    # The blend can exceed 1.0; clip before the 8-bit cast so bright pixels
    # saturate instead of wrapping around.
    superimposed_img = np.clip(colored_heatmap * alpha + img, 0.0, 1.0)
    superimposed_img = np.uint8(255 * superimposed_img)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    panels = [
        (img, 'Original Image'),
        (colored_heatmap, 'Refined Heatmap'),
        (superimposed_img, 'Superimposed Image'),
    ]
    for ax, (panel, title) in zip(axes, panels):
        ax.imshow(panel)
        ax.set_title(title)
        ax.axis('off')

    code = info['code']
    no_img = info['no_img']
    pred_score = info['pred_score']
    fig.suptitle(f'Creative: {code} [No.{no_img}] - Model Pred = {pred_score:.2f}')

    # savefig does not create missing directories.
    os.makedirs('images_output', exist_ok=True)
    fig.savefig(f'images_output/{code}_{no_img}.png')
    plt.show()
    # This is called in a loop; release the figure so memory does not grow.
    plt.close(fig)

# Layer whose activations Grad-CAM visualises; the same for every creative.
last_conv_layer_name = "conv5_block3_out"
# Converted once here instead of on every loop iteration.
expanded_creative_codes = np.array(test_creative_code_expanded)

# Sets have no stable iteration order; sort so the output order is reproducible.
for creative_code in sorted(true_positive_samples):
    print('#'*50, creative_code, '#'*50,)
    # Rows of the expanded test arrays that belong to this creative.
    hero_creative_index = np.where(expanded_creative_codes == creative_code)[0]

    img_array = tf.gather(test_frame_features_expanded, hero_creative_index.tolist())
    tag_array = tf.gather(test_tag_base_features_expanded, hero_creative_index.tolist())

    preds = final_model.predict([img_array, tag_array])

    # Visit however many rows this creative actually has, rather than assuming six.
    for i in range(len(hero_creative_index)):
        # Re-add the batch axis: the model expects batched inputs.
        img = tf.expand_dims(img_array[i], axis=0)
        tag = tf.expand_dims(tag_array[i], axis=0)
        heatmap = get_gradcam_heatmap(final_model, img, tag, last_conv_layer_name)
        display_gradcam(img, heatmap, info={'code': creative_code, 'no_img': i, 'pred_score': preds[i][0]})
    print()

    
