In [1]:
import clip
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json 
import sys 
from PIL import Image

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

sys.path.insert(0, os.path.dirname(os.getcwd()))
import loaders
from utils import scores as sc
from utils import evaluation as ev

In [2]:
import logging

# Configure the notebook-wide logger.
# logging.getLogger returns the same object for the same name on every call, so
# simply calling addHandler again when this cell is re-run would stack another
# console handler and print every message once per run. Drop the console handler
# a previous run of this cell installed before adding a fresh one.
logger = logging.getLogger('notebook_logger')
logger.setLevel(logging.DEBUG)
for previous_handler in list(logger.handlers):
    # Exact type check: leave subclasses (e.g. FileHandler) added elsewhere alone.
    if type(previous_handler) is logging.StreamHandler:
        logger.removeHandler(previous_handler)

stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Define Data Loader

In [3]:
# Build the project data loader for the "qa" data source and load the dialogue
# table. df_table_origin is kept untouched; later cells work on copies of it.
# TODO: Add VSNR for cosine similarity
data_loader = loaders.DataLoader(data_source = "qa", logger=logger)
df_table_origin = data_loader.load_dialogue_df()

# Disabled example: display sample k and its CLIP similarity.
#k = 5
#data_loader.showing_example(k)
#data_loader.show_clip_similarity(k, df_table, model, preprocess)

# Define OOD Categories below

In [31]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn.functional as F
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from models.DNN import model 

tqdm.pandas()

# Load the CLIP model once; it is reused for every held-out category below.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Leave-one-category-out evaluation: each supercategory in turn is held out and
# the remaining ones form the in-distribution label set. The per-category result
# tables are collected in result_total, keyed by the held-out category.
result_total = {}
for category in data_loader.supercategories:
    logger.info(f"Processing OOD Category: {category}")
    ood_category = [category]
    ind_category = [x for x in data_loader.supercategories if x not in ood_category]

    df_table = df_table_origin.copy()
    # NOTE: despite the column name, 'OOD' is 1 when the row carries at least one
    # in-distribution category and 0 otherwise.
    # NOTE(review): `item in x` assumes each 'supercategory' value is a collection
    # of labels (it would be a substring match on a plain string) - confirm.
    df_table['OOD'] = df_table['supercategory'].apply(lambda x: 1 if any(item in x for item in ind_category) else 0)

    if data_loader.data_source == "real":
        ## Dialogue Processing
        # The saved feature rows are assigned positionally, one per df_table row.
        dialogue_clip = np.load(f'{data_loader.data_dir}/CLIP/mmd_dialogs_truncate/mmd_clip_dialog_features.npy')
        df_table['dialogue_clip'] = list(dialogue_clip)

        ## Image Processing
        # Pair every image file name with its feature row, then attach by file name.
        image_clip = np.load(f'{data_loader.data_dir}/CLIP/mmd_imgs/mmd_clip_img_features.npy')
        image_annotation = pd.read_json(f'{data_loader.data_dir}/CLIP/mmd_imgs/mmd_imgs_filenames.json')
        image_annotation = image_annotation.rename(columns={0:"img_file"}).join(pd.DataFrame(pd.DataFrame(image_clip.tolist()).apply(np.array, axis=1)))
        image_annotation = image_annotation.rename(columns={0:"image_clip"})
        df_table = df_table.merge(image_annotation, on='img_file', how='left')

    elif data_loader.data_source == "qa":
        ## Dialogue Processing
        # The saved feature rows are assigned positionally, one per df_table row.
        dialogue_clip = np.load(f'{data_loader.data_dir}/CLIP/qa_dialogs_truncate/qa_clip_dialog_features.npy')
        df_table['dialogue_clip'] = list(dialogue_clip)

        ## Image Processing
        df_table['image_file'] = df_table['image_id'].astype('str') + '.jpg'
        image_clip = np.load(f'{data_loader.data_dir}/CLIP/qa_imgs/qa_clip_img_features.npy')
        image_annotation = pd.read_json(f'{data_loader.data_dir}/CLIP/qa_imgs/all_img_names.json')
        image_annotation = image_annotation.rename(columns={0:"image_file"})
        image_annotation['image_clip'] = list(image_clip)
        df_table = df_table.merge(image_annotation, on='image_file', how='left')

    # Fail fast with a clear message instead of a KeyError deep inside the
    # row-wise scoring below when the data source matched neither branch.
    missing_columns = [name for name in ('dialogue_clip', 'image_clip') if name not in df_table.columns]
    if missing_columns:
        raise ValueError(
            f"No CLIP features available for data_source={data_loader.data_source!r}; "
            f"missing columns: {missing_columns}"
        )

    # Multi-hot encode the in-distribution labels; one extra column per category.
    # The concat below is positional, so it relies on df_table having a default
    # RangeIndex (which the merge above produces).
    mlb = MultiLabelBinarizer(classes=ind_category)
    df_table['encoded_label'] = list(mlb.fit_transform(df_table['supercategory']))
    encoded_df = pd.DataFrame(df_table['encoded_label'].tolist(), columns=ind_category)
    df_table = pd.concat([df_table, encoded_df], axis=1)

    logger.info("Calculating Similarity Scores")
    #if data_loader.data_source == "qa":
    #    df_table['image_id'] = df_table['image_id'].apply(lambda x: f"COCO_train2014_{int(x):012d}")

    # CLIP text embedding of the prompt "Category <name>" for every
    # in-distribution category, stored as flat numpy vectors.
    categories_clip = {}
    for categories in ind_category:
        text = 'Category ' + categories
        text_tokens = clip.tokenize([text]).to(device)
        with torch.no_grad():
            text_features = clip_model.encode_text(text_tokens).flatten().cpu().numpy()
        categories_clip[categories] = text_features
    def calculate_similarity_score(row, type="image"):
        """Score one row against the text embeddings of all in-distribution categories.

        Args:
            row: mapping-like row holding the 'image_clip' / 'dialogue_clip' embedding.
            type: "image" or "dialogue"; selects which embedding column is scored.
                (The name shadows the builtin but is kept for existing callers.)

        Returns:
            (sum, max) of the cosine similarities between the selected embedding
            and every vector in categories_clip for the categories in
            ind_category; (0, 0) when ind_category is empty.

        Raises:
            ValueError: if type is neither "image" nor "dialogue".
        """
        embedding_columns = {"image": 'image_clip', "dialogue": 'dialogue_clip'}
        if type not in embedding_columns:
            raise ValueError(f"type must be 'image' or 'dialogue', got {type!r}")
        embedding = row[embedding_columns[type]]

        similarities = []
        for name in ind_category:
            text_features = categories_clip[name]
            similarities.append(
                np.dot(text_features, embedding) / (np.linalg.norm(text_features) * np.linalg.norm(embedding))
            )

        if not similarities:
            return 0, 0
        # Take the true maximum: seeding a running max with 0 would clamp
        # negative similarities to 0.
        return sum(similarities), max(similarities)

    # Row-wise (sum, max) category similarity for the image embedding ...
    image_pairs = df_table.progress_apply(calculate_similarity_score, axis=1)
    df_table['image_score'], df_table['image_score_max'] = zip(*image_pairs)
    # ... and for the dialogue embedding ('dialogue' is passed as the `type` argument).
    dialogue_pairs = df_table.progress_apply(calculate_similarity_score, axis=1, args=('dialogue',))
    df_table['dialogue_score'], df_table['dialogue_score_max'] = zip(*dialogue_pairs)
    
    def image_text_similarity(row):
        """Return the cosine similarity between a row's dialogue and image embeddings."""
        dialogue_vec, image_vec = row['dialogue_clip'], row['image_clip']
        norm_product = np.linalg.norm(dialogue_vec) * np.linalg.norm(image_vec)
        return np.dot(dialogue_vec, image_vec) / norm_product
    # Cosine-based baseline columns on the full table. The "_transform" variants
    # multiply the combined score by 4 * image-text similarity (the factor 4 is
    # taken as-is; its rationale is not visible in this notebook).
    df_table['image_text_similarity'] = df_table.apply(image_text_similarity, axis=1)
    for combined, image_col, dialogue_col in (
        ('overall_simialrity', 'image_score_max', 'dialogue_score_max'),
        ('overall_simialrity_sum', 'image_score', 'dialogue_score'),
    ):
        df_table[combined] = (df_table[image_col] + df_table[dialogue_col])
    for combined in ('overall_simialrity', 'overall_simialrity_sum'):
        df_table[f'{combined}_transform'] = 4 * df_table['image_text_similarity'] * df_table[combined]

    # One classifier wrapper per modality.
    # NOTE(review): only the image loader is given proportion=0.8; the dialogue
    # loader falls back to model_loader's default - confirm this is intended.
    shared_model_kwargs = dict(logger=logger, num_epochs=6, learning_rate=0.001, seed=20)
    image_model_loader = model.model_loader(proportion=0.8, **shared_model_kwargs)
    dialogue_model_loader = model.model_loader(**shared_model_kwargs)

    # Train/test split produced by the image loader and shared by both modalities.
    dataset_parts = image_model_loader.create_dataset(data_loader, df_table, add_mismatch = True, mismatch_num = 5000)
    (df_ind_train, df_test,
     X_train_image, X_test_image,
     X_train_dialogue, X_test_dialogue,
     Y_train, Y_test) = dataset_parts

    # Recomputed on the test frame returned by create_dataset.
    # NOTE(review): presumably needed because add_mismatch=True adds new
    # image/dialogue pairings - confirm against create_dataset.
    df_test['image_text_similarity'] = df_test.apply(image_text_similarity, axis=1)

    ood_tag = '_'.join(ood_category)
    image_model_loader.train_model(X_train_image, Y_train, X_test_image, Y_test, ood_category = ood_tag)
    dialogue_model_loader.train_model(X_train_dialogue, Y_train, X_test_dialogue, Y_test, ood_category = ood_tag)

    score_type_list = ["mp", "energy", "logits", "msp"]

    for score_type in score_type_list:
        image_score_sum, image_score_max = image_model_loader.evaluate_on_test(
            X_test_image, Y_test, score_type=score_type, return_score=True)
        dialogue_score_sum, dialogue_score_max = dialogue_model_loader.evaluate_on_test(
            X_test_dialogue, Y_test, score_type=score_type, return_score=True)

        # Raw per-modality scores returned by the two models.
        raw_scores = {
            'sum_image': image_score_sum,
            'max_image': image_score_max,
            'sum_dialogue': dialogue_score_sum,
            'max_dialogue': dialogue_score_max,
        }
        for suffix, score_values in raw_scores.items():
            df_test[f'{score_type}_{suffix}'] = score_values

        # Same scores weighted by 4 * image-text similarity.
        for suffix in ('max_image', 'max_dialogue', 'sum_image', 'sum_dialogue'):
            df_test[f'{score_type}_{suffix}_tranform'] = 4 * df_test['image_text_similarity'] * df_test[f'{score_type}_{suffix}']

        # Combined image + dialogue score; the "sum" variant is only built for
        # the energy and logits score types.
        aggregations = ('max', 'sum') if score_type in ["energy", "logits"] else ('max',)
        for agg in aggregations:
            df_test[f'{score_type}_overall_simialrity_{agg}'] = df_test[f'{score_type}_{agg}_image'] + df_test[f'{score_type}_{agg}_dialogue']
            df_test[f'{score_type}_overall_simialrity_{agg}_transform'] = df_test[f'{score_type}_{agg}_image_tranform'] + df_test[f'{score_type}_{agg}_dialogue_tranform']

    # Parallel accumulators for the long-format results table built further down.
    metrics, values, scores = [], [], []

    def eval_dict(score):
        """Build the three evaluators (FPR, AUROC, AUPR) for one score column.

        Each returned callable takes a DataFrame and passes its 'OOD' labels and
        the values of column `score` to the matching function in `ev`; the FPR
        evaluator additionally passes 0.95.
        """
        def labels_and_scores(frame):
            return frame['OOD'].values, frame[score].values

        return {
            "FPR": lambda x: ev.fpr_evaluation(*labels_and_scores(x), 0.95),
            "AUROC": lambda x: ev.auroc_evaluation(*labels_and_scores(x)),
            "AUPR": lambda x: ev.aupr_evaluation(*labels_and_scores(x)),
        }

    def _model_score_columns(score_type, agg):
        """Return the df_test column names of one model-based score, per modality."""
        return {
            "Image": f'{score_type}_{agg}_image',
            "Dialogue": f'{score_type}_{agg}_dialogue',
            "Overall": f'{score_type}_overall_simialrity_{agg}',
            "Overall_Transform": f'{score_type}_overall_simialrity_{agg}_transform',
        }

    # Score name -> modality -> df_test column holding that score.
    score_columns = {
        "Max Cosine": {
            "Image": 'image_score_max',
            "Dialogue": 'dialogue_score_max',
            "Overall": 'overall_simialrity',
            "Overall_Transform": 'overall_simialrity_transform',
        },
        "Sum Cosine": {
            "Image": 'image_score',
            "Dialogue": 'dialogue_score',
            "Overall": 'overall_simialrity_sum',
            "Overall_Transform": 'overall_simialrity_sum_transform',
        },
        "Energy Sum": _model_score_columns('energy', 'sum'),
        "Energy Max": _model_score_columns('energy', 'max'),
        "MSP": _model_score_columns('msp', 'max'),
        "Max Prob": _model_score_columns('mp', 'max'),
        "Max Logits": _model_score_columns('logits', 'max'),
        "Sum Logits": _model_score_columns('logits', 'sum'),
    }

    # Define the metrics and corresponding functions:
    # score name -> modality -> {"FPR"/"AUROC"/"AUPR": evaluator}.
    metric_functions = {
        score: {metric: eval_dict(column) for metric, column in columns.items()}
        for score, columns in score_columns.items()
    }

    # Loop through each metric and calculate values.
    # The three lists are extended by the same amount (one entry per evaluator)
    # so they stay aligned however many evaluators eval_dict returns.
    for score, items in metric_functions.items():
        for metric, funcs in items.items():
            scores.extend([score] * len(funcs))
            metrics.extend([metric] * len(funcs))
            values.extend([func(df_test) for func in funcs.values()])

    # Create DataFrame ("Metric" holds the modality, "Score" the score name).
    df = pd.DataFrame({"Metric": metrics, "Value": values, "Score": scores})
    df['Value'] = df['Value'].apply(lambda x: round(x, 3))
    # Collect each (modality, score) group's values into one list, in evaluator
    # order [FPR, AUROC, AUPR], then pivot to rows = score, columns = modality.
    result = df.groupby(['Metric', 'Score'])['Value'].agg(list).unstack().transpose()
    result_total[category] =  result[['Image', 'Dialogue', 'Overall', 'Overall_Transform']]
    logger.info(f"Results for OOD Category {category}:\n {result}")
    logger.info(f'Finished Processing OOD Category: {category}')

2024-05-18 18:50:00,438 - notebook_logger - INFO - Processing OOD Category: person
2024-05-18 18:50:06,727 - notebook_logger - INFO - Calculating Similarity Scores


  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 18:50:46,587 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 18:50:46,637 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 18:51:02,514 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1531, Train Accuracy: 0.5592, Test Loss: 0.1593, Test Accuracy: 0.5478
 17%|█▋        | 1/6 [00:14<01:14, 14.84s/it]2024-05-18 18:51:18,322 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1406, Train Accuracy: 0.5868, Test Loss: 0.1499, Test Accuracy: 0.5668
 33%|███▎      | 2/6 [00:30<01:01, 15.41s/it]2024-05-18 18:51:34,079 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1327, Train Accuracy: 0.6038, Test Loss: 0.1463, Test Accuracy: 0.5767
 50%|█████     | 3/6 [00:46<00:46, 15.57s/it]2024-05-18 18:51:49,576 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1260, Train Accuracy: 0.6172, Test Loss: 0.1447, Test Accuracy: 0.5827
 67%|██████▋   | 4/6 [01:01<00:31, 15.54s/it]2024-05-18 18:52:05,461 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 18:54:46,249 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 18:54:46,252 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 18:55:00,757 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1782, Train Accuracy: 0.4807, Test Loss: 0.1644, Test Accuracy: 0.4784
 17%|█▋        | 1/6 [00:13<01:07, 13.41s/it]2024-05-18 18:55:14,303 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1631, Train Accuracy: 0.5076, Test Loss: 0.1534, Test Accuracy: 0.4958
 33%|███▎      | 2/6 [00:26<00:53, 13.49s/it]2024-05-18 18:55:27,856 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1532, Train Accuracy: 0.5350, Test Loss: 0.1540, Test Accuracy: 0.4889
 50%|█████     | 3/6 [00:40<00:40, 13.52s/it]2024-05-18 18:55:41,484 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1438, Train Accuracy: 0.5515, Test Loss: 0.1576, Test Accuracy: 0.4433
 67%|██████▋   | 4/6 [00:54<00:27, 13.56s/it]2024-05-18 18:55:55,087 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 18:58:26,197 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 18:58:26,200 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 18:58:41,996 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1561, Train Accuracy: 0.5535, Test Loss: 0.1718, Test Accuracy: 0.4703
 17%|█▋        | 1/6 [00:14<01:13, 14.65s/it]2024-05-18 18:58:56,442 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1439, Train Accuracy: 0.5730, Test Loss: 0.1712, Test Accuracy: 0.4624
 33%|███▎      | 2/6 [00:29<00:58, 14.53s/it]2024-05-18 18:59:10,962 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1358, Train Accuracy: 0.5935, Test Loss: 0.1695, Test Accuracy: 0.4693
 50%|█████     | 3/6 [00:43<00:43, 14.52s/it]2024-05-18 18:59:25,400 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1279, Train Accuracy: 0.6108, Test Loss: 0.1656, Test Accuracy: 0.4834
 67%|██████▋   | 4/6 [00:58<00:28, 14.49s/it]2024-05-18 18:59:40,725 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:02:20,252 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:02:20,255 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:02:36,239 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1596, Train Accuracy: 0.5381, Test Loss: 0.1658, Test Accuracy: 0.5140
 17%|█▋        | 1/6 [00:14<01:14, 14.86s/it]2024-05-18 19:02:51,174 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1425, Train Accuracy: 0.5676, Test Loss: 0.1524, Test Accuracy: 0.5377
 33%|███▎      | 2/6 [00:29<00:59, 14.91s/it]2024-05-18 19:03:06,502 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1350, Train Accuracy: 0.5840, Test Loss: 0.1460, Test Accuracy: 0.5563
 50%|█████     | 3/6 [00:45<00:45, 15.10s/it]2024-05-18 19:03:21,696 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1267, Train Accuracy: 0.6018, Test Loss: 0.1476, Test Accuracy: 0.5472
 67%|██████▋   | 4/6 [01:00<00:30, 15.14s/it]2024-05-18 19:03:36,850 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:06:15,316 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:06:15,319 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:06:31,081 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1527, Train Accuracy: 0.5514, Test Loss: 0.1555, Test Accuracy: 0.5453
 17%|█▋        | 1/6 [00:14<01:13, 14.73s/it]2024-05-18 19:06:45,945 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1367, Train Accuracy: 0.5831, Test Loss: 0.1432, Test Accuracy: 0.5681
 33%|███▎      | 2/6 [00:29<00:59, 14.81s/it]2024-05-18 19:07:00,793 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1274, Train Accuracy: 0.6007, Test Loss: 0.1382, Test Accuracy: 0.5743
 50%|█████     | 3/6 [00:44<00:44, 14.83s/it]2024-05-18 19:07:15,577 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1203, Train Accuracy: 0.6188, Test Loss: 0.1352, Test Accuracy: 0.5855
 67%|██████▋   | 4/6 [00:59<00:29, 14.81s/it]2024-05-18 19:07:30,390 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:10:08,365 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:10:08,368 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:10:24,153 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1674, Train Accuracy: 0.5207, Test Loss: 0.1724, Test Accuracy: 0.5104
 17%|█▋        | 1/6 [00:14<01:13, 14.75s/it]2024-05-18 19:10:39,185 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1517, Train Accuracy: 0.5515, Test Loss: 0.1601, Test Accuracy: 0.5342
 33%|███▎      | 2/6 [00:29<00:59, 14.92s/it]2024-05-18 19:10:54,183 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1413, Train Accuracy: 0.5721, Test Loss: 0.1549, Test Accuracy: 0.5442
 50%|█████     | 3/6 [00:44<00:44, 14.95s/it]2024-05-18 19:11:09,127 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1362, Train Accuracy: 0.5831, Test Loss: 0.1547, Test Accuracy: 0.5424
 67%|██████▋   | 4/6 [00:59<00:29, 14.95s/it]2024-05-18 19:11:24,035 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:14:05,717 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:14:05,720 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:14:21,622 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1510, Train Accuracy: 0.5508, Test Loss: 0.1557, Test Accuracy: 0.5371
 17%|█▋        | 1/6 [00:14<01:14, 14.85s/it]2024-05-18 19:14:36,715 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1366, Train Accuracy: 0.5791, Test Loss: 0.1443, Test Accuracy: 0.5615
 33%|███▎      | 2/6 [00:29<00:59, 14.99s/it]2024-05-18 19:14:51,674 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1296, Train Accuracy: 0.5971, Test Loss: 0.1417, Test Accuracy: 0.5699
 50%|█████     | 3/6 [00:44<00:44, 14.98s/it]2024-05-18 19:15:06,642 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1239, Train Accuracy: 0.6105, Test Loss: 0.1423, Test Accuracy: 0.5733
 67%|██████▋   | 4/6 [00:59<00:29, 14.97s/it]2024-05-18 19:15:22,286 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:18:01,130 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:18:01,132 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:18:16,789 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1676, Train Accuracy: 0.5117, Test Loss: 0.1761, Test Accuracy: 0.4734
 17%|█▋        | 1/6 [00:14<01:12, 14.56s/it]2024-05-18 19:18:31,680 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1496, Train Accuracy: 0.5546, Test Loss: 0.1596, Test Accuracy: 0.5152
 33%|███▎      | 2/6 [00:29<00:59, 14.75s/it]2024-05-18 19:18:46,401 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1389, Train Accuracy: 0.5775, Test Loss: 0.1534, Test Accuracy: 0.5279
 50%|█████     | 3/6 [00:44<00:44, 14.74s/it]2024-05-18 19:19:01,067 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1320, Train Accuracy: 0.5888, Test Loss: 0.1528, Test Accuracy: 0.5280
 67%|██████▋   | 4/6 [00:58<00:29, 14.71s/it]2024-05-18 19:19:15,819 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:21:55,413 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:21:55,416 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:22:12,237 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1449, Train Accuracy: 0.5688, Test Loss: 0.1503, Test Accuracy: 0.5424
 17%|█▋        | 1/6 [00:15<01:18, 15.72s/it]2024-05-18 19:22:27,893 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1314, Train Accuracy: 0.6008, Test Loss: 0.1424, Test Accuracy: 0.5686
 33%|███▎      | 2/6 [00:31<01:02, 15.68s/it]2024-05-18 19:22:44,105 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1256, Train Accuracy: 0.6098, Test Loss: 0.1401, Test Accuracy: 0.5689
 50%|█████     | 3/6 [00:47<00:47, 15.92s/it]2024-05-18 19:22:58,853 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1189, Train Accuracy: 0.6283, Test Loss: 0.1417, Test Accuracy: 0.5734
 67%|██████▋   | 4/6 [01:02<00:30, 15.46s/it]2024-05-18 19:23:13,441 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:25:50,609 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:25:50,611 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:26:06,260 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1570, Train Accuracy: 0.5372, Test Loss: 0.1620, Test Accuracy: 0.5234
 17%|█▋        | 1/6 [00:14<01:13, 14.60s/it]2024-05-18 19:26:21,230 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1463, Train Accuracy: 0.5560, Test Loss: 0.1549, Test Accuracy: 0.5392
 33%|███▎      | 2/6 [00:29<00:59, 14.82s/it]2024-05-18 19:26:36,131 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1359, Train Accuracy: 0.5810, Test Loss: 0.1504, Test Accuracy: 0.5484
 50%|█████     | 3/6 [00:44<00:44, 14.86s/it]2024-05-18 19:26:51,001 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1282, Train Accuracy: 0.5950, Test Loss: 0.1475, Test Accuracy: 0.5520
 67%|██████▋   | 4/6 [00:59<00:29, 14.86s/it]2024-05-18 19:27:06,055 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:29:46,436 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:29:46,438 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:30:02,384 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1629, Train Accuracy: 0.5260, Test Loss: 0.1665, Test Accuracy: 0.5141
 17%|█▋        | 1/6 [00:14<01:14, 14.85s/it]2024-05-18 19:30:17,273 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1491, Train Accuracy: 0.5558, Test Loss: 0.1569, Test Accuracy: 0.5355
 33%|███▎      | 2/6 [00:29<00:59, 14.87s/it]2024-05-18 19:30:32,743 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1406, Train Accuracy: 0.5719, Test Loss: 0.1542, Test Accuracy: 0.5409
 50%|█████     | 3/6 [00:45<00:45, 15.15s/it]2024-05-18 19:30:47,821 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1336, Train Accuracy: 0.5861, Test Loss: 0.1543, Test Accuracy: 0.5410
 67%|██████▋   | 4/6 [01:00<00:30, 15.12s/it]2024-05-18 19:31:02,725 - notebook_logger - INFO - Ep

  0%|          | 0/122218 [00:00<?, ?it/s]

  0%|          | 0/122218 [00:00<?, ?it/s]

2024-05-18 19:33:56,738 - notebook_logger - INFO - Setting random seed: 20
2024-05-18 19:33:56,740 - notebook_logger - INFO - Setting random seed: 20
  0%|          | 0/6 [00:00<?, ?it/s]2024-05-18 19:34:14,264 - notebook_logger - INFO - Epoch 1, Train Loss: 0.1549, Train Accuracy: 0.5408, Test Loss: 0.1566, Test Accuracy: 0.5356
 17%|█▋        | 1/6 [00:16<01:21, 16.36s/it]2024-05-18 19:34:30,271 - notebook_logger - INFO - Epoch 2, Train Loss: 0.1435, Train Accuracy: 0.5626, Test Loss: 0.1487, Test Accuracy: 0.5486
 33%|███▎      | 2/6 [00:32<01:04, 16.15s/it]2024-05-18 19:34:46,271 - notebook_logger - INFO - Epoch 3, Train Loss: 0.1333, Train Accuracy: 0.5854, Test Loss: 0.1469, Test Accuracy: 0.5520
 50%|█████     | 3/6 [00:48<00:48, 16.08s/it]2024-05-18 19:35:01,930 - notebook_logger - INFO - Epoch 4, Train Loss: 0.1266, Train Accuracy: 0.6010, Test Loss: 0.1481, Test Accuracy: 0.5430
 67%|██████▋   | 4/6 [01:04<00:31, 15.92s/it]2024-05-18 19:35:18,595 - notebook_logger - INFO - Ep

In [38]:
# Results with 'animal' as the held-out category; each cell lists [FPR, AUROC, AUPR].
result_total['animal']

Metric,Image,Dialogue,Overall,Overall_Transform
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Energy Max,"[0.383, 0.837, 0.819]","[0.74, 0.79, 0.799]","[0.385, 0.844, 0.846]","[0.396, 0.909, 0.933]"
Energy Sum,"[0.318, 0.846, 0.823]","[0.505, 0.82, 0.813]","[0.327, 0.848, 0.848]","[0.304, 0.913, 0.929]"
MSP,"[0.833, 0.696, 0.742]","[0.872, 0.706, 0.753]","[0.801, 0.738, 0.788]","[0.747, 0.737, 0.77]"
Max Cosine,"[0.916, 0.646, 0.708]","[0.927, 0.578, 0.65]","[0.907, 0.622, 0.688]","[0.97, 0.418, 0.544]"
Max Logits,"[0.383, 0.837, 0.819]","[0.74, 0.79, 0.799]","[0.384, 0.845, 0.847]","[0.39, 0.909, 0.932]"
Max Prob,"[0.383, 0.837, 0.819]","[0.74, 0.79, 0.799]","[0.458, 0.855, 0.859]","[0.505, 0.877, 0.902]"
Sum Cosine,"[0.944, 0.597, 0.67]","[0.926, 0.587, 0.659]","[0.917, 0.61, 0.678]","[0.969, 0.412, 0.535]"
Sum Logits,"[0.887, 0.74, 0.774]","[0.975, 0.649, 0.725]","[0.936, 0.732, 0.794]","[0.893, 0.654, 0.693]"


In [39]:

# Average the per-category result tables element-wise over all held-out categories.
result_columns = ['Image', 'Dialogue', 'Overall', 'Overall_Transform']

# Row labels (score names) are taken from the first stored result table.
metrics = result_total[next(iter(result_total))].index

# For every score and column: the list of value lists, one per held-out category.
sum_data = {
    metric: {
        column: [dataset[column][metric] for dataset in result_total.values()]
        for column in result_columns
    }
    for metric in metrics
}

# Mean over categories (axis 0), rounded to 3 decimals, kept as plain lists.
avg_data = {'Metric': metrics}
for column in result_columns:
    avg_data[column] = [
        np.mean(sum_data[metric][column], axis=0).round(3).tolist()
        for metric in metrics
    ]

df_avg = pd.DataFrame(avg_data)

df_avg

Unnamed: 0,Metric,Image,Dialogue,Overall,Overall_Transform
0,Energy Max,"[0.762, 0.619, 0.825]","[0.864, 0.612, 0.825]","[0.772, 0.626, 0.843]","[0.734, 0.815, 0.941]"
1,Energy Sum,"[0.736, 0.623, 0.826]","[0.813, 0.621, 0.827]","[0.747, 0.621, 0.843]","[0.681, 0.818, 0.939]"
2,MSP,"[0.91, 0.563, 0.805]","[0.902, 0.58, 0.813]","[0.892, 0.599, 0.838]","[0.651, 0.845, 0.931]"
3,Max Cosine,"[0.936, 0.531, 0.788]","[0.947, 0.515, 0.78]","[0.942, 0.529, 0.787]","[0.956, 0.485, 0.762]"
4,Max Logits,"[0.762, 0.619, 0.825]","[0.864, 0.612, 0.825]","[0.772, 0.627, 0.844]","[0.743, 0.812, 0.94]"
5,Max Prob,"[0.762, 0.619, 0.825]","[0.864, 0.612, 0.825]","[0.789, 0.653, 0.855]","[0.462, 0.908, 0.964]"
6,Sum Cosine,"[0.949, 0.5, 0.77]","[0.948, 0.508, 0.776]","[0.946, 0.51, 0.776]","[0.956, 0.478, 0.755]"
7,Sum Logits,"[0.946, 0.562, 0.809]","[0.958, 0.547, 0.804]","[0.977, 0.574, 0.833]","[0.985, 0.385, 0.727]"


In [6]:
# Results with 'person' as the held-out category; each cell lists [FPR, AUROC, AUPR].
# NOTE(review): the saved output has no Overall_Transform column and an older
# execution count than the loop cell, so it looks stale - re-run to refresh.
result_total['person']

Metric,Image,Dialogue,Overall
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Energy Max,"[0.939, 0.517, 0.826]","[0.947, 0.517, 0.827]","[0.915, 0.731, 0.93]"
Energy Sum,"[0.93, 0.524, 0.827]","[0.943, 0.522, 0.829]","[0.861, 0.786, 0.946]"
MSP,"[0.95, 0.507, 0.824]","[0.949, 0.51, 0.825]","[0.816, 0.838, 0.961]"
Max Cosine,"[0.947, 0.508, 0.822]","[0.944, 0.505, 0.821]","[0.943, 0.512, 0.823]"
Max Logits,"[0.939, 0.517, 0.826]","[0.947, 0.517, 0.827]","[0.929, 0.721, 0.928]"
Max Prob,"[0.939, 0.517, 0.826]","[0.947, 0.517, 0.827]","[0.731, 0.881, 0.972]"
Sum Cosine,"[0.945, 0.506, 0.821]","[0.944, 0.506, 0.821]","[0.942, 0.512, 0.822]"
Sum Logits,"[0.951, 0.505, 0.822]","[0.945, 0.501, 0.822]","[0.996, 0.24, 0.709]"


## Results on OOD

Each table cell lists `[FPR, AUROC, AUPR]` for the given score (row) and modality (column).

# Without mismatching pairs: 23233 test ID instances, 6051 test OOD instances


## Without image text similarity score
| Score      | Image                 | Dialogue              | Overall               |
|:-----------|:----------------------|:----------------------|:----------------------|
| Energy Max | [0.373, 0.937, 0.983] | [0.791, 0.886, 0.972] | [0.32, 0.952, 0.988]  |
| Energy Sum | [0.236, 0.964, 0.991] | [0.508, 0.935, 0.984] | [0.151, 0.974, 0.994] |
| MSP        | [0.894, 0.625, 0.873] | [0.919, 0.68, 0.905]  | [0.885, 0.657, 0.889] |
| Max Cosine | [0.877, 0.711, 0.905] | [0.971, 0.54, 0.835]  | [0.964, 0.579, 0.857] |
| Max Logits | [0.373, 0.937, 0.983] | [0.791, 0.886, 0.972] | [0.331, 0.951, 0.988] |
| Max Prob   | [0.373, 0.937, 0.983] | [0.791, 0.886, 0.972] | [0.507, 0.927, 0.982] |
| Sum Cosine | [0.94, 0.576, 0.84]   | [0.978, 0.494, 0.814] | [0.972, 0.51, 0.817]  |
| Sum Logits | [0.904, 0.762, 0.934] | [0.974, 0.758, 0.931] | [0.932, 0.788, 0.943] |

## With image-text similarity score applied to the overall score
| Score      | Image                 | Dialogue              | Overall               |
|:-----------|:----------------------|:----------------------|:----------------------|
| Energy Max | [0.373, 0.937, 0.983] | [0.791, 0.886, 0.972] | [0.346, 0.945, 0.986] |
| Energy Sum | [0.236, 0.964, 0.991] | [0.508, 0.935, 0.984] | [0.194, 0.97, 0.993]  |
| MSP        | [0.894, 0.625, 0.873] | [0.919, 0.68, 0.905]  | [0.908, 0.574, 0.846] |
| Max Cosine | [0.877, 0.711, 0.905] | [0.971, 0.54, 0.835]  | [0.964, 0.579, 0.857] |
| Max Logits | [0.373, 0.937, 0.983] | [0.791, 0.886, 0.972] | [0.35, 0.945, 0.986]  |
| Max Prob   | [0.373, 0.937, 0.983] | [0.791, 0.886, 0.972] | [0.731, 0.793, 0.934] |
| Sum Cosine | [0.94, 0.576, 0.84]   | [0.978, 0.494, 0.814] | [0.972, 0.51, 0.817]  |
| Sum Logits | [0.904, 0.762, 0.934] | [0.974, 0.758, 0.931] | [0.955, 0.764, 0.934] |

# With 5000 mismatching pairs: 23233 test ID instances, 11051 test OOD instances


## Without image text similarity
| Score      | Image                 | Dialogue              | Overall               |
|:-----------|:----------------------|:----------------------|:----------------------|
| Energy Max | [0.625, 0.747, 0.815] | [0.859, 0.721, 0.81]  | [0.614, 0.758, 0.838] |
| Energy Sum | [0.546, 0.762, 0.821] | [0.698, 0.748, 0.819] | [0.522, 0.767, 0.843] |
| MSP        | [0.92, 0.57, 0.737]   | [0.932, 0.604, 0.762] | [0.921, 0.598, 0.773] |
| Max Cosine | [0.908, 0.618, 0.758] | [0.961, 0.523, 0.708] | [0.959, 0.544, 0.724] |
| Max Logits | [0.625, 0.747, 0.815] | [0.859, 0.721, 0.81]  | [0.619, 0.758, 0.839] |
| Max Prob   | [0.625, 0.747, 0.815] | [0.859, 0.721, 0.81]  | [0.707, 0.765, 0.848] |
| Sum Cosine | [0.945, 0.543, 0.712] | [0.965, 0.497, 0.693] | [0.963, 0.506, 0.695] |
| Sum Logits | [0.923, 0.65, 0.783]  | [0.963, 0.648, 0.782] | [0.953, 0.672, 0.814] |

## With image-text similarity score applied to the overall score
| Score      | Image                 | Dialogue              | Overall               |
|:-----------|:----------------------|:----------------------|:----------------------|
| Energy Max | [0.625, 0.747, 0.815] | [0.859, 0.721, 0.81]  | [0.598, 0.862, 0.932] |
| Energy Sum | [0.546, 0.762, 0.821] | [0.698, 0.748, 0.819] | [0.511, 0.872, 0.934] |
| MSP        | [0.92, 0.57, 0.737]   | [0.932, 0.604, 0.762] | [0.752, 0.723, 0.831] |
| Max Cosine | [0.908, 0.618, 0.758] | [0.961, 0.523, 0.708] | [0.959, 0.544, 0.724] |
| Max Logits | [0.625, 0.747, 0.815] | [0.859, 0.721, 0.81]  | [0.607, 0.86, 0.931]  |
| Max Prob   | [0.625, 0.747, 0.815] | [0.859, 0.721, 0.81]  | [0.569, 0.859, 0.924] |
| Sum Cosine | [0.945, 0.543, 0.712] | [0.965, 0.497, 0.693] | [0.963, 0.506, 0.695] |
| Sum Logits | [0.923, 0.65, 0.783]  | [0.963, 0.648, 0.782] | [0.975, 0.546, 0.709] |






