In [1]:
import clip
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json 
import sys 
from PIL import Image

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

sys.path.insert(0, os.path.dirname(os.getcwd()))
import loaders
from utils import scores as sc
from utils import evaluation as ev

In [2]:
# Pick the compute device (GPU when CUDA is available, otherwise CPU) and load the
# CLIP "ViT-B/32" checkpoint onto it. `clip.load` returns two objects, which are
# kept as `clip_model` and `preprocess`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
import logging

# Configure the notebook logger: messages from DEBUG upwards go to a stream handler
# and are formatted as "<time> - <logger name> - <level> - <message>".
logger = logging.getLogger('notebook_logger')
logger.setLevel(logging.DEBUG)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
# logging.getLogger returns the same logger object for the same name, so re-running
# this cell used to stack one more handler per run and print every message several
# times. Detach handlers left over from earlier runs before attaching the new one.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(stream_handler)

# Define Data Loader

In [4]:
# Build the project data loader for the "qa" data source and load the dialogue table
# that the rest of the notebook derives its working copies from.
# TODO: Add VSNR for cosine similarity
data_loader = loaders.DataLoader(data_source="qa", logger=logger)
df_table_origin = data_loader.load_dialogue_df()

# Define OOD Categories below

In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn.functional as F
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from models.DNN import model 

# Register `progress_apply` on pandas objects so the row-wise scoring further down
# can display a progress bar.
tqdm.pandas()

# `device`, `clip_model` and `preprocess` are already created by the "Load CLIP model"
# cell near the top of the notebook. Loading ViT-B/32 a second time here produced the
# same objects at the cost of extra time and memory, so the duplicate load was dropped.

# Hold out one supercategory as out-of-distribution (OOD); every other supercategory
# known to the data loader is treated as in-distribution.
category = "animal"
logger.info(f"Processing OOD Category: {category}")
ood_category = [category]
ind_category = [x for x in data_loader.supercategories if x not in ood_category]

# Work on a copy so the table loaded earlier stays untouched and this cell can be re-run.
df_table = df_table_origin.copy()
# Label convention: 'OOD' is 0 when the row's supercategory contains a held-out
# category and 1 otherwise.
df_table['OOD'] = df_table['supercategory'].apply(lambda x: 0 if any(item in x for item in ood_category) else 1)
# This used to be a bare `value_counts()` expression in the middle of the cell, whose
# result was silently discarded; log the label balance instead so it is visible.
logger.info(f"OOD label counts: {df_table['OOD'].value_counts().to_dict()}")

# Attach the pre-computed CLIP features stored under `data_loader.data_dir` to the table.
# Dialogue features are assigned positionally, so the rows of the .npy file must be in
# the same order as `df_table`; image features are matched by file name via a left merge.
if data_loader.data_source == "real":
    dialogue_clip = np.load(f'{data_loader.data_dir}/CLIP/mmd_dialogs_truncate/mmd_clip_dialog_features.npy')
    df_table['dialogue_clip'] = list(dialogue_clip)
    image_clip = np.load(f'{data_loader.data_dir}/CLIP/mmd_imgs/mmd_clip_img_features.npy')
    image_annotation = pd.read_json(f'{data_loader.data_dir}/CLIP/mmd_imgs/mmd_imgs_filenames.json')
    # Column 0 of the JSON holds the file names; the joined column 0 holds one feature array per image.
    image_annotation = image_annotation.rename(columns={0:"img_file"}).join(pd.DataFrame(pd.DataFrame(image_clip.tolist()).apply(np.array, axis=1)))
    image_annotation.rename(columns={0:"image_clip"}, inplace=True)
    df_table = df_table.merge(image_annotation, on='img_file', how='left')

elif data_loader.data_source == "qa":
    dialogue_clip = np.load(f'{data_loader.data_dir}/CLIP/qa_dialogs_truncate/qa_clip_dialog_features_single.npy')
    df_table['dialogue_clip'] = list(dialogue_clip)
    # Image files are named "<image_id>.jpg".
    df_table['image_file'] = df_table['image_id'].astype('str') + '.jpg'
    image_clip = np.load(f'{data_loader.data_dir}/CLIP/qa_imgs/qa_clip_img_features.npy')
    image_annotation = pd.read_json(f'{data_loader.data_dir}/CLIP/qa_imgs/all_img_names.json')
    image_annotation = image_annotation.rename(columns={0:"image_file"})
    image_annotation['image_clip'] = list(image_clip)
    df_table = df_table.merge(image_annotation, on='image_file', how='left')

else:
    # Previously an unknown source fell through silently and the notebook only failed
    # later with a KeyError on the missing 'dialogue_clip' / 'image_clip' columns.
    raise ValueError(
        f"Unsupported data source {data_loader.data_source!r}; expected 'real' or 'qa'"
    )

# Multi-hot encode each row's supercategories over the in-distribution classes only.
# The encoding is stored twice: as one array per row in 'encoded_label', and as one
# column per in-distribution category appended to the table.
mlb = MultiLabelBinarizer(classes=ind_category)
label_matrix = mlb.fit_transform(df_table['supercategory'])
df_table['encoded_label'] = list(label_matrix)
encoded_df = pd.DataFrame(
    df_table['encoded_label'].tolist(),
    columns=ind_category,
)
# Column-wise concat aligns on the index of both frames.
df_table = pd.concat([df_table, encoded_df], axis=1)

logger.info("Calculating Similarity Scores")

# Embed the text prompt "Category <name>" for every in-distribution category with the
# CLIP text encoder. Each result is flattened, moved to the CPU and stored as a NumPy
# vector keyed by the category name.
categories_clip = {}
with torch.no_grad():
    for category_name in ind_category:
        prompt_tokens = clip.tokenize(['Category ' + category_name]).to(device)
        prompt_embedding = clip_model.encode_text(prompt_tokens)
        categories_clip[category_name] = prompt_embedding.flatten().cpu().numpy()

def calculate_similarity_score(row, type = "image", categories = None, category_features = None):
    """Score one row against the category text embeddings using cosine similarity.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the embedding under 'image_clip' (type="image") or
        'dialogue_clip' (type="dialogue").
    type : str
        Which embedding of the row to score: "image" or "dialogue".
    categories : iterable of str, optional
        Category names to score against. Defaults to the notebook-level `ind_category`.
    category_features : mapping, optional
        Category name -> text embedding. Defaults to the notebook-level `categories_clip`.

    Returns
    -------
    tuple
        (sum of the cosine similarities over all categories, largest cosine similarity).
        (0, 0) when there are no categories.

    Raises
    ------
    ValueError
        If `type` is neither "image" nor "dialogue".
    """
    embedding_columns = {"image": "image_clip", "dialogue": "dialogue_clip"}
    if type not in embedding_columns:
        # The original code hit an UnboundLocalError on `column` for unknown types.
        raise ValueError(f"Unknown type {type!r}; expected 'image' or 'dialogue'")

    # Fall back to the notebook globals so existing `apply(...)` calls keep working.
    if categories is None:
        categories = ind_category
    if category_features is None:
        category_features = categories_clip

    embedding = row[embedding_columns[type]]
    embedding_norm = np.linalg.norm(embedding)  # loop-invariant, computed once

    similarities = []
    for name in categories:
        text_features = category_features[name]
        similarities.append(
            np.dot(text_features, embedding) / (np.linalg.norm(text_features) * embedding_norm)
        )

    if not similarities:
        return 0, 0
    # Take the true maximum: the running max used to start at 0, which clamped the
    # result to 0 whenever every similarity was negative.
    return sum(similarities), max(similarities)

# Score every row against the category prompts: first with the image embedding (the
# function's default), then with the dialogue embedding. Each call yields one
# (sum, max) pair per row, which is unzipped into two columns.
image_score_pairs = df_table.progress_apply(calculate_similarity_score, axis=1)
df_table['image_score'], df_table['image_score_max'] = zip(*image_score_pairs)

dialogue_score_pairs = df_table.progress_apply(calculate_similarity_score, axis=1, args=('dialogue',))
df_table['dialogue_score'], df_table['dialogue_score_max'] = zip(*dialogue_score_pairs)

def image_text_similarity(row):
    """Return the cosine similarity between a row's dialogue and image CLIP embeddings.

    `row` must provide the two vectors under 'dialogue_clip' and 'image_clip'.
    """
    dialogue_vec, image_vec = row['dialogue_clip'], row['image_clip']
    norm_product = np.linalg.norm(dialogue_vec) * np.linalg.norm(image_vec)
    return np.dot(dialogue_vec, image_vec) / norm_product
# Per-row agreement between the dialogue and image embeddings.
df_table['image_text_similarity'] = df_table.apply(image_text_similarity, axis=1)

# Combine the two modalities by adding their scores (max-based and sum-based).
# The misspelled column names ('simialrity') are referenced by name elsewhere in the
# notebook, so they are kept exactly as they are.
df_table['overall_simialrity'] = df_table['image_score_max'] + df_table['dialogue_score_max']
df_table['overall_simialrity_sum'] = df_table['image_score'] + df_table['dialogue_score']

# "Transform" variants: scale the combined score by 4 x the image-text agreement.
agreement_weight = 4 * df_table['image_text_similarity']
df_table['overall_simialrity_transform'] = agreement_weight * df_table['overall_simialrity']
df_table['overall_simialrity_sum_transform'] = agreement_weight * df_table['overall_simialrity_sum']

# Settings shared by the image and the dialogue model wrappers.
shared_loader_kwargs = dict(logger=logger, num_epochs=6, learning_rate=0.001, seed=20)

# NOTE(review): only the image loader is given `proportion=0.8`; the dialogue loader
# relies on whatever default `model.model_loader` uses. The dataset split below is
# created by the image loader only - confirm the asymmetry is intentional.
image_model_loader = model.model_loader(proportion=0.8, **shared_loader_kwargs)
dialogue_model_loader = model.model_loader(**shared_loader_kwargs)

# Build the train/test split once, via the image loader, and reuse it for both models.
dataset_parts = image_model_loader.create_dataset(
    data_loader,
    df_table,
    add_mismatch=True,
    mismatch_num=10000,
)
(df_ind_train, df_test,
 X_train_image, X_test_image,
 X_train_dialogue, X_test_dialogue,
 Y_train, Y_test) = dataset_parts

# Recompute the image-text agreement on the test frame returned by `create_dataset`.
df_test['image_text_similarity'] = df_test.apply(image_text_similarity, axis=1)

# Train one model per modality on the shared labels.
ood_tag = '_'.join(ood_category)
image_model_loader.train_model(X_train_image, Y_train, X_test_image, Y_test, ood_category=ood_tag)
dialogue_model_loader.train_model(X_train_dialogue, Y_train, X_test_dialogue, Y_test, ood_category=ood_tag)


2024-08-04 13:59:48,949 - notebook_logger - INFO - Processing OOD Category: animal
2024-08-04 13:59:55,945 - notebook_logger - INFO - Calculating Similarity Scores


  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-08-04 14:00:38,242 - notebook_logger - INFO - Setting random seed: 20
2024-08-04 14:00:38,246 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-08-04 14:00:52,939 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1802, Train Accuracy: 0.4734, Test Loss: 0.2094, Test Accuracy: 0.3659
 17%|█▋        | 1/6 [00:13<01:06, 13.34s/it]2024-08-04 14:01:06,239 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1612, Train Accuracy: 0.5166, Test Loss: 0.1851, Test Accuracy: 0.4481
 33%|███▎      | 2/6 [00:26<00:53, 13.32s/it]2024-08-04 14:01:19,618 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1502, Train Accuracy: 0.5392, Test Loss: 0.1753, Test Accuracy: 0.4670
 50%|█████     | 3/6 [00:40<00:40, 13.35s/it]2024-08-04 14:01:32,777 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1428, Train Accuracy: 0.5543, Test Loss: 0.1695, Test Accuracy: 0.4888
 67%|██████▋   | 4/6 [00:53<00:26, 13.27s/it]2024-08-04 14:01:45,898 - notebook_logger - INFO - Ep

In [6]:
score_type_list = ["prob", "energy", "logits", "msp", "odin", "mahalanobis"]

# NOTE(review): the following cell repeats this whole loop and stores additional
# columns, so the (slow) evaluation done here is computed again there.
for score_type in score_type_list:
    # Only the "mahalanobis" score is additionally given the training split.
    if score_type == "mahalanobis":
        image_train_kwargs = {"X_train": X_train_image, "Y_train": Y_train}
        dialogue_train_kwargs = {"X_train": X_train_dialogue, "Y_train": Y_train}
    else:
        image_train_kwargs = {}
        dialogue_train_kwargs = {}

    # Each call returns a (sum-style score, max-style score) pair for the test set.
    image_score_sum, image_score_max = image_model_loader.evaluate_on_test(
        X_test_image,
        Y_test,
        score_type=score_type,
        return_score=True,
        **image_train_kwargs
    )
    dialogue_score_sum, dialogue_score_max = dialogue_model_loader.evaluate_on_test(
        X_test_dialogue,
        Y_test,
        score_type=score_type,
        return_score=True,
        **dialogue_train_kwargs
    )

    df_test[f'{score_type}_sum_image'] = image_score_sum
    df_test[f'{score_type}_max_image'] = image_score_max
    df_test[f'{score_type}_sum_dialogue'] = dialogue_score_sum
    df_test[f'{score_type}_max_dialogue'] = dialogue_score_max

    # The combined sum score is stored for every score type in this list except "msp".
    if score_type in ["energy", "logits", "prob", "odin", "mahalanobis"]:
        df_test[f'{score_type}_overall_simialrity_sum'] = (
            df_test[f'{score_type}_sum_image'] + df_test[f'{score_type}_sum_dialogue']
        )




2024-08-04 14:03:30,338 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:03:32,011 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:03:33,646 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:03:35,376 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:03:36,965 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:03:38,564 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:03:40,309 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:03:41,917 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:03:48,465 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:03:54,797 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:07:04,903 - notebook_logger - INFO - Test Loss: 0.1817, Test Accur

In [7]:
score_type_list = ["prob", "energy", "logits", "msp", "odin", "mahalanobis"]

for score_type in score_type_list:
    # Only the "mahalanobis" score is additionally given the training split.
    if score_type == "mahalanobis":
        image_train_kwargs = {"X_train": X_train_image, "Y_train": Y_train}
        dialogue_train_kwargs = {"X_train": X_train_dialogue, "Y_train": Y_train}
    else:
        image_train_kwargs = {}
        dialogue_train_kwargs = {}

    # Each call returns a (sum-style score, max-style score) pair for the test set.
    image_score_sum, image_score_max = image_model_loader.evaluate_on_test(
        X_test_image,
        Y_test,
        score_type=score_type,
        return_score=True,
        **image_train_kwargs
    )
    dialogue_score_sum, dialogue_score_max = dialogue_model_loader.evaluate_on_test(
        X_test_dialogue,
        Y_test,
        score_type=score_type,
        return_score=True,
        **dialogue_train_kwargs
    )

    df_test[f'{score_type}_sum_image'] = image_score_sum
    df_test[f'{score_type}_max_image'] = image_score_max
    df_test[f'{score_type}_sum_dialogue'] = dialogue_score_sum
    df_test[f'{score_type}_max_dialogue'] = dialogue_score_max

    # "Transform" columns rescale each raw score by the image-text agreement:
    # mahalanobis scores are divided by it (4 / similarity), all others multiplied
    # (4 * similarity). The '_tranform' spelling is kept because these column names
    # are only referenced inside this loop under that exact spelling.
    if score_type == "mahalanobis":
        agreement_weight = 4 / df_test['image_text_similarity']
    else:
        agreement_weight = 4 * df_test['image_text_similarity']
    for agg in ("max", "sum"):
        for modality in ("image", "dialogue"):
            df_test[f'{score_type}_{agg}_{modality}_tranform'] = (
                agreement_weight * df_test[f'{score_type}_{agg}_{modality}']
            )

    # Combine both modalities by addition, for the raw and the transformed max scores.
    df_test[f'{score_type}_overall_simialrity_max'] = (
        df_test[f'{score_type}_max_image'] + df_test[f'{score_type}_max_dialogue']
    )
    df_test[f'{score_type}_overall_simialrity_max_transform'] = (
        df_test[f'{score_type}_max_image_tranform'] + df_test[f'{score_type}_max_dialogue_tranform']
    )

    # The combined sum scores are stored for every score type in this list except "msp".
    if score_type in ["energy", "logits", "prob", "odin", "mahalanobis"]:
        df_test[f'{score_type}_overall_simialrity_sum'] = (
            df_test[f'{score_type}_sum_image'] + df_test[f'{score_type}_sum_dialogue']
        )
        df_test[f'{score_type}_overall_simialrity_sum_transform'] = (
            df_test[f'{score_type}_sum_image_tranform'] + df_test[f'{score_type}_sum_dialogue_tranform']
        )


# Three parallel accumulators, filled in lock-step by the evaluation loop below:
# the column label, the evaluated value and the score name of each result.
metrics, values, scores = [], [], []

def eval_dict(score):
    """Build the three evaluators ("FPR", "AUROC", "AUPR") for one score column.

    Each evaluator takes a DataFrame and passes its 'OOD' column (labels) and its
    `score` column (scores) to the matching function in `ev`. The FPR evaluator also
    passes 0.95 as a third argument (presumably the target recall / TPR level -
    confirm against `utils.evaluation`).
    """
    def _fpr(frame):
        return ev.fpr_evaluation(frame['OOD'].values, frame[score].values, 0.95)

    def _auroc(frame):
        return ev.auroc_evaluation(frame['OOD'].values, frame[score].values)

    def _aupr(frame):
        return ev.aupr_evaluation(frame['OOD'].values, frame[score].values)

    return {"FPR": _fpr, "AUROC": _auroc, "AUPR": _aupr}

def _classifier_score_evaluators(prefix, agg):
    """Evaluators for one classifier-based score family.

    `prefix` is the score type ("energy", "prob", ...) and `agg` is "max" or "sum";
    together they select the per-modality and combined columns written by the
    scoring loop above.
    """
    return {
        "Image": eval_dict(f'{prefix}_{agg}_image'),
        "Dialogue": eval_dict(f'{prefix}_{agg}_dialogue'),
        "Overall": eval_dict(f'{prefix}_overall_simialrity_{agg}'),
        "Overall_Transform": eval_dict(f'{prefix}_overall_simialrity_{agg}_transform'),
    }


# Score name -> {"Image", "Dialogue", "Overall", "Overall_Transform"} -> evaluators.
# The two cosine entries use their own column names; every other entry follows the
# "<score type>_<max|sum>_..." naming pattern and is generated by the helper.
metric_functions = {
    "Max Cosine": {
        "Image": eval_dict('image_score_max'),
        "Dialogue": eval_dict('dialogue_score_max'),
        "Overall": eval_dict('overall_simialrity'),
        "Overall_Transform": eval_dict('overall_simialrity_transform'),
    },
    "Sum Cosine": {
        "Image": eval_dict('image_score'),
        "Dialogue": eval_dict('dialogue_score'),
        "Overall": eval_dict('overall_simialrity_sum'),
        "Overall_Transform": eval_dict('overall_simialrity_sum_transform'),
    },
    "Energy Sum": _classifier_score_evaluators('energy', 'sum'),
    "Energy Max": _classifier_score_evaluators('energy', 'max'),
    "MSP": _classifier_score_evaluators('msp', 'max'),
    "Max Prob": _classifier_score_evaluators('prob', 'max'),
    "Sum Prob": _classifier_score_evaluators('prob', 'sum'),
    "Max Odin": _classifier_score_evaluators('odin', 'max'),
    "Sum Odin": _classifier_score_evaluators('odin', 'sum'),
    "Max Mahalanobis": _classifier_score_evaluators('mahalanobis', 'max'),
    "Sum Mahalanobis": _classifier_score_evaluators('mahalanobis', 'sum'),
    "Max Logits": _classifier_score_evaluators('logits', 'max'),
    "Sum Logits": _classifier_score_evaluators('logits', 'sum'),
}

# Evaluate every (score, column label, evaluator) combination on the test frame and
# append one entry to each of the three parallel lists per evaluation.
# The previous version extended `scores` by a hard-coded `len(items) * 3`, which only
# stayed aligned with `metrics`/`values` while every entry had exactly three
# evaluators; appending in lock-step keeps the lists the same length by construction.
for score, items in metric_functions.items():
    for metric, funcs in items.items():
        for func in funcs.values():
            scores.append(score)
            metrics.append(metric)
            values.append(func(df_test))

# Long-format results: one row per (score, column label, evaluator) value.
df = pd.DataFrame({"Metric": metrics, "Value": values, "Score": scores})
df['Value'] = df['Value'].apply(lambda x: round(x, 3))

# Pivot to one row per score and one column per label; each cell holds the list of
# values for that pair in the order they were appended (FPR, AUROC, AUPR).
result = df.groupby(['Metric', 'Score'])['Value'].agg(list).unstack().transpose()

# Row order used for the final table.
order = ['Max Cosine',
         'Sum Cosine',
         'Max Prob',
         'Sum Prob',
         'Max Logits',
         'Sum Logits',
         'Max Odin',
         'Sum Odin',
         'Max Mahalanobis',
         'Sum Mahalanobis',
         'MSP',
         'Energy Sum',
         'Energy Max']

# Select the columns, order the rows and expose 'Score' as a regular column in a
# single chain. This replaces an in-place `reset_index` on a derived frame followed
# by a `set_index`/`reset_index` round-trip; the resulting frame is the same.
result_df = (
    result[['Image', 'Dialogue', 'Overall', 'Overall_Transform']]
    .loc[order]
    .reset_index()
)
def convert_to_percentage(lst):
    """Format each fraction in `lst` as a percentage with one decimal, joined by ' / '."""
    formatted = []
    for fraction in lst:
        formatted.append(f'{fraction*100:.1f}')
    return ' / '.join(formatted)


# Turn each cell's list of values into a "a / b / c" percentage string.
for column in ['Image', 'Dialogue', 'Overall', 'Overall_Transform']:
    result_df[column] = result_df[column].apply(convert_to_percentage)

# With escape=False pandas writes the header strings verbatim, so the underscore in
# "Overall_Transform" must be escaped by hand; a raw "_" outside math mode makes the
# generated table fail to compile in LaTeX.
latex_table = result_df.to_latex(
    index=False,
    column_format='|l|c|c|c|c|',
    header=["Score", "Image", "Dialogue", "Overall", r"Overall\_Transform"],
    escape=False,
)

print(latex_table)

2024-08-04 14:18:32,977 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:18:34,548 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:18:36,235 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:18:37,847 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:18:39,445 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:18:41,039 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:18:42,664 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:18:44,323 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:18:50,906 - notebook_logger - INFO - Test Loss: 0.1817, Test Accuracy: 0.4604
2024-08-04 14:18:57,297 - notebook_logger - INFO - Test Loss: 0.3974, Test Accuracy: 0.1788
2024-08-04 14:22:05,042 - notebook_logger - INFO - Test Loss: 0.1817, Test Accur

\begin{tabular}{|l|c|c|c|c|}
\toprule
Score & Image & Dialogue & Overall & Overall_Transform \\
\midrule
Max Cosine & 91.2 / 63.1 / 47.7 & 91.4 / 56.8 / 39.5 & 89.1 / 61.4 / 45.2 & 95.5 / 47.7 / 35.6 \\
Sum Cosine & 93.9 / 56.0 / 41.8 & 91.5 / 56.3 / 38.8 & 90.6 / 58.1 / 41.6 & 95.9 / 46.7 / 34.4 \\
Max Prob & 60.3 / 75.8 / 58.2 & 87.1 / 67.3 / 51.1 & 66.9 / 75.1 / 59.6 & 74.8 / 74.1 / 60.9 \\
Sum Prob & 70.9 / 68.4 / 49.0 & 94.7 / 62.2 / 45.4 & 78.3 / 68.7 / 51.6 & 78.4 / 71.4 / 56.4 \\
Max Logits & 60.3 / 75.8 / 58.2 & 87.1 / 67.3 / 51.1 & 61.3 / 76.4 / 61.3 & 62.6 / 78.9 / 68.1 \\
Sum Logits & 92.7 / 58.1 / 43.1 & 98.4 / 50.1 / 40.0 & 97.6 / 55.8 / 43.7 & 97.2 / 53.1 / 40.8 \\
Max Odin & 59.9 / 75.6 / 58.0 & 89.6 / 67.0 / 50.7 & 67.7 / 74.8 / 59.0 & 76.6 / 73.6 / 61.6 \\
Sum Odin & 71.5 / 68.0 / 48.6 & 94.9 / 61.5 / 45.0 & 78.3 / 68.0 / 51.1 & 79.7 / 70.5 / 55.4 \\
Max Mahalanobis & 49.2 / 81.3 / 62.9 & 71.7 / 63.9 / 42.0 & 62.4 / 77.8 / 60.7 & 60.3 / 81.8 / 66.7 \\
Sum Mahalanobis 