In [1]:
import clip
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json 
import sys 
from PIL import Image

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

sys.path.insert(0, os.path.dirname(os.getcwd()))
import loaders
from utils import scores as sc
from utils import evaluation as ev

In [2]:
import logging
# Set Logger
# Notebook-wide logger. Emits DEBUG and above to a stream handler using a
# timestamped format.
logger = logging.getLogger('notebook_logger')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more handler per run and print each message
# several times. Drop the handler a previous run of this cell attached first.
for previous_handler in list(logger.handlers):
    if previous_handler.get_name() == 'notebook_stream_handler':
        logger.removeHandler(previous_handler)

stream_handler = logging.StreamHandler()
stream_handler.set_name('notebook_stream_handler')
stream_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Define Data Loader

In [4]:
# Showing one example
# TODO: Add VSNR for cosine similarity
# Loader for the "qa" data source with the BLIP model type; it logs through
# the notebook logger.
data_loader = loaders.DataLoader(
    data_source="qa",
    model_type='blip',
    logger=logger,
)
# Annotation table as returned by the loader; later cells work on a copy of it.
df_table_origin = data_loader.load_annotations_df()

# Define OOD Categories below

In [20]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn.functional as F
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from models.DNN import model 

# Enable tqdm's pandas integration.
tqdm.pandas()

used_model = "blip"

# `input_size` handed to the classifiers for each supported feature extractor
# (presumably the feature dimension -- confirm against the stored features).
input_size_by_model = {"clip": 512, "blip": 256}
if used_model not in input_size_by_model:
    # Without this guard `input_size` would stay undefined and the failure
    # would only surface later as a NameError when the models are built.
    raise ValueError(
        f"Unsupported used_model {used_model!r}; expected one of {sorted(input_size_by_model)}"
    )
input_size = input_size_by_model[used_model]


category = "animal"
logger.info(f"Processing OOD Category: {category}")
# Held-out (OOD) supercategories versus the remaining in-distribution ones.
ood_category = [category]
ind_category = [x for x in data_loader.supercategories if x not in ood_category]

# Work on a copy so re-running the cell always starts from the original table.
df_table = df_table_origin.copy()
# 'OOD' is 0 when any held-out category appears in the row's 'supercategory'
# value and 1 otherwise.
df_table['OOD'] = df_table['supercategory'].apply(lambda x: 0 if any(item in x for item in ood_category) else 1)
# A bare `value_counts()` in the middle of a cell is never displayed; log it.
logger.info(f"OOD label counts: {df_table['OOD'].value_counts().to_dict()}")

# Dialogue Processing
# Load the saved dialogue feature array.
dialogue_feature_path = f'{data_loader.data_dir}/BLIP/qa_dialogs/BLIP_imc_dialog_features.npy'
dialogue_blip = np.load(dialogue_feature_path)

## Image Processing
# Load the saved image feature array.
image_feature_path = f'{data_loader.data_dir}/BLIP/qa_imgs/BLIP_imc_image_features.npy'
image_blip = np.load(image_feature_path)

# list(...) splits each array along its first axis, so every row of the table
# receives one entry of the array.
df_table['dialogue_blip'] = list(dialogue_blip)
df_table['image_file'] = df_table['image_id'].astype('str') + '.jpg'
df_table['image_blip'] = list(image_blip)


# Multi-hot encode each row's supercategories over the in-distribution classes
# only (the binarizer is restricted to `ind_category`).
mlb = MultiLabelBinarizer(classes=ind_category)
encoded_labels = mlb.fit_transform(df_table['supercategory'])
df_table['encoded_label'] = list(encoded_labels)

# Reuse df_table's own index: pd.concat(axis=1) aligns on index labels, so a
# default RangeIndex here would misalign rows (and introduce NaNs) whenever
# the annotation table is not indexed 0..n-1.
encoded_df = pd.DataFrame(encoded_labels, columns=ind_category, index=df_table.index)
df_table = pd.concat([df_table, encoded_df], axis=1)

logger.info("Calculating Similarity Scores")
def image_text_similarity(row):
    """Return the cosine similarity between a row's dialogue and image features.

    Parameters
    ----------
    row : mapping
        Must provide 'dialogue_blip' and 'image_blip' vectors of equal length.

    Returns
    -------
    The dot product of the two vectors divided by the product of their
    Euclidean norms.
    """
    dialogue_vec, image_vec = row['dialogue_blip'], row['image_blip']
    norm_product = np.linalg.norm(dialogue_vec) * np.linalg.norm(image_vec)
    return np.dot(dialogue_vec, image_vec) / norm_product

# Row-wise cosine similarity between each row's dialogue and image features.
df_table['image_text_similarity'] = df_table.apply(image_text_similarity, axis=1)

# Settings shared by the image and the dialogue classifier.
shared_model_kwargs = dict(
    logger=logger,
    input_size=input_size,
    output_size=len(ind_category),
    num_epochs=6,
    learning_rate=0.001,
    seed=20,
)

# Only the image loader is given `proportion`; the dialogue loader relies on
# model_loader's default.
# NOTE(review): confirm that default matches 0.8 if the two are meant to agree.
image_model_loader = model.model_loader(proportion=0.8, **shared_model_kwargs)

dialogue_model_loader = model.model_loader(**shared_model_kwargs)

# Build the train/test splits for both modalities in a single call so the
# image and dialogue models share the same rows and labels.
(
    df_ind_train,
    df_test,
    X_train_image,
    X_test_image,
    X_train_dialogue,
    X_test_dialogue,
    Y_train,
    Y_test,
) = image_model_loader.create_dataset(
    data_loader,
    df_table,
    add_mismatch=True,
    mismatch_num=20000,
    # Use the notebook-level setting instead of repeating the literal 'blip',
    # so the dataset always matches the `used_model` that fixed `input_size`.
    used_model=used_model,
)

# NOTE(review): the similarity is recomputed on the returned test frame,
# presumably because add_mismatch=True changes image/dialogue pairings -- confirm.
df_test['image_text_similarity'] = df_test.apply(image_text_similarity, axis=1)

# Both models are trained against the same labels and the same OOD tag.
ood_tag = '_'.join(ood_category)
image_model_loader.train_model(X_train_image, Y_train, X_test_image, Y_test, ood_category=ood_tag)
dialogue_model_loader.train_model(X_train_dialogue, Y_train, X_test_dialogue, Y_test, ood_category=ood_tag)


2024-08-04 15:57:07,541 - notebook_logger - INFO - Processing OOD Category: animal
2024-08-04 15:57:13,898 - notebook_logger - INFO - Calculating Similarity Scores
2024-08-04 15:57:15,388 - notebook_logger - INFO - Setting random seed: 20
2024-08-04 15:57:15,390 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-08-04 15:57:29,429 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1697, Train Accuracy: 0.5064, Test Loss: 0.1989, Test Accuracy: 0.4241
 17%|█▋        | 1/6 [00:12<01:03, 12.64s/it]2024-08-04 15:57:42,176 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1499, Train Accuracy: 0.5491, Test Loss: 0.1667, Test Accuracy: 0.5072
 33%|███▎      | 2/6 [00:25<00:50, 12.70s/it]2024-08-04 15:57:54,811 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1389, Train Accuracy: 0.5713, Test Loss: 0.1747, Test Accuracy: 0.4907
 50%|█████     | 3/6 [00:38<00:38, 12.67s/it]2024-08-04 15:58:07,766 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1390,

In [21]:
# Score every test row with each score type, for both modalities, and store
# the sum/max variants as columns of df_test.
score_type_list = ["prob", "energy", "logits", "msp", "odin", "mahalanobis"]
for score_type in score_type_list:
    # Only the "mahalanobis" score is additionally given the training split.
    if score_type == "mahalanobis":
        image_extra = {"X_train": X_train_image, "Y_train": Y_train}
        dialogue_extra = {"X_train": X_train_dialogue, "Y_train": Y_train}
    else:
        image_extra = {}
        dialogue_extra = {}

    image_score_sum, image_score_max = image_model_loader.evaluate_on_test(
        X_test_image,
        Y_test,
        score_type=score_type,
        return_score=True,
        **image_extra,
    )
    dialogue_score_sum, dialogue_score_max = dialogue_model_loader.evaluate_on_test(
        X_test_dialogue,
        Y_test,
        score_type=score_type,
        return_score=True,
        **dialogue_extra,
    )

    df_test[f'{score_type}_sum_image'] = image_score_sum
    df_test[f'{score_type}_max_image'] = image_score_max
    df_test[f'{score_type}_sum_dialogue'] = dialogue_score_sum
    df_test[f'{score_type}_max_dialogue'] = dialogue_score_max

    # The combined "sum" score is stored for every score type listed here
    # ("msp" is the one left out).
    if score_type in ["energy", "logits", "prob", "odin", "mahalanobis"]:
        combined_sum = df_test[f'{score_type}_sum_image'] + df_test[f'{score_type}_sum_dialogue']
        df_test[f'{score_type}_overall_simialrity_sum'] = combined_sum




2024-08-04 15:59:49,398 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 15:59:51,050 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 15:59:52,782 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 15:59:54,532 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 15:59:56,245 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 15:59:58,032 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 15:59:59,716 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:00:01,379 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:00:07,753 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:00:14,036 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:01:07,604 - notebook_logger - INFO - Test Loss: 0.1641, Test Accur

In [22]:
# Score every test row with each score type, then derive similarity-weighted
# ("tranform") and combined image+dialogue ("overall") columns on df_test.
score_type_list = ["prob", "energy", "logits", "msp", "odin", "mahalanobis"]

for score_type in score_type_list:
    is_mahalanobis = score_type == "mahalanobis"

    # Only the "mahalanobis" score is additionally given the training split.
    if is_mahalanobis:
        image_extra = {"X_train": X_train_image, "Y_train": Y_train}
        dialogue_extra = {"X_train": X_train_dialogue, "Y_train": Y_train}
    else:
        image_extra = {}
        dialogue_extra = {}

    image_score_sum, image_score_max = image_model_loader.evaluate_on_test(
        X_test_image,
        Y_test,
        score_type=score_type,
        return_score=True,
        **image_extra,
    )
    dialogue_score_sum, dialogue_score_max = dialogue_model_loader.evaluate_on_test(
        X_test_dialogue,
        Y_test,
        score_type=score_type,
        return_score=True,
        **dialogue_extra,
    )

    df_test[f'{score_type}_sum_image'] = image_score_sum
    df_test[f'{score_type}_max_image'] = image_score_max
    df_test[f'{score_type}_sum_dialogue'] = dialogue_score_sum
    df_test[f'{score_type}_max_dialogue'] = dialogue_score_max

    # Weight applied to every raw score: mahalanobis scores are divided by the
    # image/text similarity, all other scores are multiplied by it (times 4).
    similarity = df_test['image_text_similarity']
    if is_mahalanobis:
        weight = 4 / similarity
    else:
        weight = 4 * similarity

    for agg in ("max", "sum"):
        for modality in ("image", "dialogue"):
            raw_column = f'{score_type}_{agg}_{modality}'
            df_test[f'{raw_column}_tranform'] = weight * df_test[raw_column]

    # Combined image + dialogue scores. The "max" variant is always stored;
    # the "sum" variant only for the score types listed below.
    for agg in ("max", "sum"):
        if agg == "sum" and score_type not in ["energy", "logits", "prob", "odin", "mahalanobis"]:
            continue
        df_test[f'{score_type}_overall_simialrity_{agg}'] = (
            df_test[f'{score_type}_{agg}_image'] + df_test[f'{score_type}_{agg}_dialogue']
        )
        df_test[f'{score_type}_overall_simialrity_{agg}_transform'] = (
            df_test[f'{score_type}_{agg}_image_tranform'] + df_test[f'{score_type}_{agg}_dialogue_tranform']
        )


# Parallel accumulators for the long-format results table; all three start
# empty and are filled in lock-step further down.
metrics, values, scores = [], [], []

def eval_dict(score):
    """Build the FPR / AUROC / AUPR evaluators for one score column.

    Parameters
    ----------
    score : str
        Name of the column holding the score to evaluate.

    Returns
    -------
    dict
        Maps "FPR", "AUROC" and "AUPR" (in that order) to callables. Each
        callable takes a frame and forwards the `.values` of its 'OOD' column
        and of the `score` column to the matching `ev` helper.
    """
    def _fpr(frame):
        # 0.95 is passed through as the third argument
        # (presumably the TPR level -- confirm in utils.evaluation).
        return ev.fpr_evaluation(frame['OOD'].values, frame[score].values, 0.95)

    def _auroc(frame):
        return ev.auroc_evaluation(frame['OOD'].values, frame[score].values)

    def _aupr(frame):
        return ev.aupr_evaluation(frame['OOD'].values, frame[score].values)

    return {"FPR": _fpr, "AUROC": _auroc, "AUPR": _aupr}

# Define the metrics and corresponding functions
metric_functions = {
    "Energy Sum": {
        "Image": eval_dict('energy_sum_image'),
        "Dialogue": eval_dict('energy_sum_dialogue'),
        "Overall": eval_dict('energy_overall_simialrity_sum'),
        "Overall_Transform": eval_dict('energy_overall_simialrity_sum_transform')
    },
    "Energy Max": {
        "Image": eval_dict('energy_max_image'),
        "Dialogue": eval_dict('energy_max_dialogue'),
        "Overall": eval_dict('energy_overall_simialrity_max'),
        "Overall_Transform": eval_dict('energy_overall_simialrity_max_transform')
    },
    "MSP": {
        "Image": eval_dict('msp_max_image'),
        "Dialogue": eval_dict('msp_max_dialogue'),
        "Overall": eval_dict('msp_overall_simialrity_max'),
        "Overall_Transform": eval_dict('msp_overall_simialrity_max_transform')
    },
    "Max Prob": {
        "Image": eval_dict('prob_max_image'),
        "Dialogue": eval_dict('prob_max_dialogue'),
        "Overall": eval_dict('prob_overall_simialrity_max'),
        "Overall_Transform": eval_dict('prob_overall_simialrity_max_transform')
    },
    "Sum Prob": {
        "Image": eval_dict('prob_sum_image'),
        "Dialogue": eval_dict('prob_sum_dialogue'),
        "Overall": eval_dict('prob_overall_simialrity_sum'),
        "Overall_Transform": eval_dict('prob_overall_simialrity_sum_transform')
    },
    "Max Odin": {
        "Image": eval_dict('odin_max_image'),
        "Dialogue": eval_dict('odin_max_dialogue'),
        "Overall": eval_dict('odin_overall_simialrity_max'),
        "Overall_Transform": eval_dict('odin_overall_simialrity_max_transform')
    },
    "Sum Odin": {
        "Image": eval_dict('odin_sum_image'),
        "Dialogue": eval_dict('odin_sum_dialogue'),
        "Overall": eval_dict('odin_overall_simialrity_sum'),
        "Overall_Transform": eval_dict('odin_overall_simialrity_sum_transform')
    },
    "Max Mahalanobis": {
        "Image": eval_dict('mahalanobis_max_image'),
        "Dialogue": eval_dict('mahalanobis_max_dialogue'),
        "Overall": eval_dict('mahalanobis_overall_simialrity_max'),
        "Overall_Transform": eval_dict('mahalanobis_overall_simialrity_max_transform')
    },
    "Sum Mahalanobis": {
        "Image": eval_dict('mahalanobis_sum_image'),
        "Dialogue": eval_dict('mahalanobis_sum_dialogue'),
        "Overall": eval_dict('mahalanobis_overall_simialrity_sum'),
        "Overall_Transform": eval_dict('mahalanobis_overall_simialrity_sum_transform')
    },
    "Max Logits": {
        "Image": eval_dict('logits_max_image'),
        "Dialogue": eval_dict('logits_max_dialogue'),
        "Overall": eval_dict('logits_overall_simialrity_max'),
        "Overall_Transform": eval_dict('logits_overall_simialrity_max_transform')
    },
    "Sum Logits": {
        "Image": eval_dict('logits_sum_image'),
        "Dialogue": eval_dict('logits_sum_dialogue'),
        "Overall": eval_dict('logits_overall_simialrity_sum'),
        "Overall_Transform": eval_dict('logits_overall_simialrity_sum_transform')
    }
}

# Loop through each metric and calculate values
for score, items in metric_functions.items():
    scores.extend([score] * len(items) * 3)
    for metric, funcs in items.items():
        metrics.extend([metric] * len(funcs))
        values.extend([func(df_test) for func in funcs.values()])

# Create DataFrame
df = pd.DataFrame({"Metric": metrics, "Value": values, "Score": scores})
df['Value'] = df['Value'].apply(lambda x: round(x, 3))
#df_grouped = df.groupby('Metric')['Value'].apply(list).reset_index()
result = df.groupby(['Metric', 'Score'])['Value'].agg(list).unstack().transpose()
result_df = result[['Image', 'Dialogue', 'Overall', 'Overall_Transform']]
result_df.reset_index(inplace=True)
order = ['Max Prob', 
         'Sum Prob', 
         'Max Logits', 
         'Sum Logits', 
         'Max Odin',
         'Sum Odin',
         'Max Mahalanobis',
         'Sum Mahalanobis',
         'MSP', 
         'Energy Sum', 
         'Energy Max']
result_df = result_df.set_index('Score').loc[order].reset_index()
def convert_to_percentage(lst):
    """Format an iterable of fractions as percentages joined by ' / '.

    Each value is multiplied by 100 and rendered with one decimal place,
    e.g. [0.643, 0.712] -> '64.3 / 71.2'. An empty iterable gives ''.
    """
    rendered = []
    for fraction in lst:
        rendered.append(format(fraction * 100, '.1f'))
    return ' / '.join(rendered)

# Render each modality's list of values as a 'a / b / c' percentage string.
for column in ['Image', 'Dialogue', 'Overall', 'Overall_Transform']:
    result_df[column] = result_df[column].apply(convert_to_percentage)

# escape=False writes the header text verbatim, so the underscore has to be
# escaped by hand: a bare '_' in LaTeX text mode makes the table fail to
# compile.
latex_table = result_df.to_latex(
    index=False,
    column_format='|l|c|c|c|c|',
    header=["Score", "Image", "Dialogue", "Overall", r"Overall\_Transform"],
    escape=False,
)

print(latex_table)

2024-08-04 16:02:04,002 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:02:05,729 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:02:07,487 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:02:09,290 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:02:10,952 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:02:12,617 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:02:14,415 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:02:16,245 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:02:22,693 - notebook_logger - INFO - Test Loss: 0.1641, Test Accuracy: 0.5033
2024-08-04 16:02:29,140 - notebook_logger - INFO - Test Loss: 0.4463, Test Accuracy: 0.2017
2024-08-04 16:03:22,857 - notebook_logger - INFO - Test Loss: 0.1641, Test Accur

\begin{tabular}{|l|c|c|c|c|}
\toprule
Score & Image & Dialogue & Overall & Overall_Transform \\
\midrule
Max Prob & 64.3 / 71.2 / 45.1 & 80.5 / 67.1 / 42.2 & 66.5 / 72.8 / 49.2 & 67.0 / 78.7 / 56.5 \\
Sum Prob & 78.8 / 64.4 / 39.3 & 96.8 / 55.9 / 35.9 & 89.0 / 62.9 / 41.9 & 74.2 / 72.7 / 51.2 \\
Max Logits & 64.3 / 71.2 / 45.1 & 80.5 / 67.1 / 42.2 & 65.1 / 72.5 / 49.2 & 62.9 / 80.9 / 63.8 \\
Sum Logits & 95.8 / 52.9 / 33.8 & 98.1 / 41.9 / 29.3 & 98.4 / 48.9 / 34.2 & 99.1 / 40.1 / 26.5 \\
Max Odin & 63.9 / 71.1 / 44.9 & 81.4 / 67.2 / 42.1 & 65.9 / 73.0 / 48.9 & 67.7 / 79.3 / 57.7 \\
Sum Odin & 79.1 / 64.2 / 39.2 & 97.0 / 56.1 / 36.0 & 89.1 / 62.8 / 41.8 & 74.5 / 72.5 / 50.9 \\
Max Mahalanobis & 46.9 / 77.7 / 50.6 & 81.0 / 66.9 / 40.5 & 62.1 / 77.0 / 52.3 & 52.6 / 87.7 / 75.4 \\
Sum Mahalanobis & 79.7 / 71.5 / 46.2 & 92.5 / 59.0 / 35.9 & 88.2 / 66.5 / 44.3 & 67.6 / 78.7 / 61.0 \\
MSP & 85.8 / 58.7 / 37.4 & 83.5 / 64.8 / 39.8 & 78.9 / 65.9 / 40.6 & 75.9 / 75.1 / 52.7 \\
Energy Sum & 63.0 