## Notebook for real MMD OOD dataset creation

To create the datasets for real MMD OOD data, please follow the procedures listed below:
1. **\[Data Sources\]** Download the [MMD dataset](https://github.com/shh1574/multi-modal-dialogue-dataset/) from this [link](https://drive.google.com/drive/folders/12-Zz4MJTASJVlbncpSWvBVqLDe5_m5QU), and also download the [COCO2014 dataset](https://cocodataset.org/#home). Put them under `dataset/mmd` and `dataset/coco`, respectively. The COCO data structure should look like this:
```
├─── dataset                    <- Main dataset folder
│   ├─── coco                   <- COCO2014 Dataset
│   │    ├─── coco_annotations  <- image annotations
│   │    ├─── train2014         <- train images
│   │    ├─── val2014           <- val images
```
The MMD data structure should look like this:
```
├─── dataset                    <- Main dataset folder
│   ├─── mmd                    <- MMD Dataset
│   │    ├─── dev               <- validation split
│   │    ├─── test              <- test split
│   │    ├─── train             <- train split
│   │    ├─── sample            <- processed target dir
```
2. **\[Dataset Generation\]** Run this notebook to create the real OOD dataset. The dialogue and label data are written to `dataset/realmmd/sample.json`, and the corresponding images are copied to the directory `dataset/realmmd/images`.



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json 
import sys 
import re
import shutil
import json 

# Put the parent of the working directory on the import path, then locate
# (or create) the `dataset` folder that lives in that same parent directory.
current_dir = os.getcwd()
sys.path.insert(0, os.path.dirname(current_dir))
dataset_dir = os.path.join(os.path.dirname(current_dir), 'dataset')
if os.path.exists(dataset_dir):
    print('dataset directory already exists.')
else:
    os.makedirs(dataset_dir)

dataset directory already exists.


In [2]:
# Input locations: COCO2014 images and annotations, plus the three MMD splits,
# all resolved relative to dataset_dir.
COCO2014_VAL_DIR = f'{dataset_dir}/coco/val2014'
COCO2014_TRAIN_DIR = f'{dataset_dir}/coco/train2014'
COCO2014_ANNOTATIONS_DIR = f'{dataset_dir}/coco/coco_annotations'
MMD_VAL_DIR = f'{dataset_dir}/mmd/dev'
MMD_TRAIN_DIR = f'{dataset_dir}/mmd/train'
MMD_TEST_DIR = f'{dataset_dir}/mmd/test'
# Output locations for the generated dataset (JSON file and copied images).
TARGET_DIR = f'{dataset_dir}/realmmd'
TARGET_IMAGE_DIR = f'{dataset_dir}/realmmd/images'

# Create both output folders independently. exist_ok=True keeps reruns
# working and, unlike a single "does TARGET_DIR exist" check, still creates
# the images folder when TARGET_DIR is already present without it.
os.makedirs(TARGET_DIR, exist_ok=True)
os.makedirs(TARGET_IMAGE_DIR, exist_ok=True)

def extract_pattern(s):
    """Return the text between ``COCO_`` and ``_<digits>`` in ``s``.

    For a file name such as ``COCO_val2014_000000283210.jpg`` this is the
    split name ``val2014``. Returns ``None`` when ``s`` does not contain
    the pattern.
    """
    found = re.search(r'COCO_(.*?)_\d+', s)
    return found.group(1) if found else None

def create_complex_index(s):
    """Build a split-aware integer ID from a COCO image file name.

    The name is split on ``_`` and expected to look like
    ``COCO_<split>_<digits>.<ext>``. The digits are prefixed with a split
    code: 1 when the split part contains ``train``, 2 when it contains
    ``val``, and 3 otherwise.

    Args:
        s: Image file name, e.g. ``COCO_val2014_000000283210.jpg``.

    Returns:
        The integer formed by ``<code><digits>``, e.g. ``2000000283210``.

    Raises:
        IndexError: If ``s`` has fewer than three ``_``-separated parts.
        ValueError: If the third part, minus its extension, is not numeric.
    """
    parts = s.split('_')
    split_name = parts[1]
    if 'train' in split_name:
        type_code = 1
    elif 'val' in split_name:
        type_code = 2
    else:
        type_code = 3
    # Remove the extension by name rather than slicing a fixed four
    # characters, which would corrupt the ID for e.g. ".jpeg" or a name
    # without an extension.
    id_str = os.path.splitext(parts[2])[0]
    return int(f"{type_code}{id_str}")

def process_and_copy_image(row):
    """Copy the COCO image referenced by ``row`` into ``TARGET_IMAGE_DIR``.

    The source folder is chosen from ``row['set_source']`` and the copy is
    stored as ``<row['image_id']>.jpg``.

    Args:
        row: Mapping-like record providing ``set_source``, ``image_id`` and
            ``img_file``.

    Returns:
        ``True`` when the copy succeeded; ``False`` when any exception was
        raised while building the paths or copying (the exception is
        printed, not re-raised).

    Raises:
        ValueError: If ``set_source`` is neither ``val2014`` nor
            ``train2014``.
    """
    split = row['set_source']
    if split not in ('val2014', 'train2014'):
        raise ValueError("Unknown set_source")
    source_dir = COCO2014_VAL_DIR if split == 'val2014' else COCO2014_TRAIN_DIR

    try:
        target_path = os.path.join(TARGET_IMAGE_DIR, f"{row['image_id']}.jpg")
        source_path = os.path.join(source_dir, row['img_file'])
        shutil.copy(source_path, target_path)
    except Exception as e:
        # Best effort: report the failure and let the caller record it.
        print(e)
        return False
    return True
    
def create_img_index(s):
    """Return the numeric COCO image number embedded in a file name.

    The name is split on ``_`` and expected to look like
    ``COCO_<split>_<digits>.<ext>``; the third part, minus its extension,
    is converted to ``int`` (so leading zeros are dropped).

    Args:
        s: Image file name, e.g. ``COCO_val2014_000000283210.jpg``.

    Returns:
        The image number as an integer, e.g. ``283210``.

    Raises:
        IndexError: If ``s`` has fewer than three ``_``-separated parts.
        ValueError: If the third part, minus its extension, is not numeric.
    """
    parts = s.split('_')
    # Remove the extension by name rather than slicing a fixed four
    # characters, which would corrupt the number for e.g. ".jpeg" or a name
    # without an extension.
    id_str = os.path.splitext(parts[2])[0]
    return int(id_str)


In [32]:
# Read the MMD dialogue annotations for each split
val_json = pd.read_json(MMD_VAL_DIR + '/dev.json')
train_json = pd.read_json(MMD_TRAIN_DIR + '/train.json')
test_json = pd.read_json(MMD_TEST_DIR + '/test.json')

# Combine all splits and keep only the dialogues paired with a COCO image
total_json = pd.concat([val_json, train_json, test_json], ignore_index=True)
coco_mmd = total_json[total_json['img_dataset'] == 'coco'].copy()
# set_source: the COCO split parsed from the image file name (e.g. 'val2014')
coco_mmd['set_source'] = coco_mmd['img_file'].apply(extract_pattern)
coco_mmd = coco_mmd.drop(columns=['img_idx'])
# image_id: split-prefixed numeric ID, used as the copied image's file name
coco_mmd['image_id'] = coco_mmd['img_file'].apply(create_complex_index)

# Load the COCO instance annotations. The files are opened in context
# managers so the handles are closed as soon as parsing finishes, instead of
# being left open for the garbage collector.
with open(COCO2014_ANNOTATIONS_DIR + '/instances_val2014.json', encoding='utf-8') as ann_file:
    coco_label_val = json.load(ann_file)
with open(COCO2014_ANNOTATIONS_DIR + '/instances_train2014.json', encoding='utf-8') as ann_file:
    coco_label_train = json.load(ann_file)
coco_label_val_ann = pd.DataFrame(coco_label_val['annotations'])
coco_label_train_ann = pd.DataFrame(coco_label_train['annotations'])
# The category table is taken from the val file and renamed so it can be
# merged on 'category_id'; the same table is merged into the train
# annotations further down.
label_df = pd.DataFrame(coco_label_val['categories'])
label_df = label_df.rename(columns={'id': 'category_id'})

# Validation split: attach category names to each annotation, then collect
# the distinct category names and supercategories present in every image.
# Note: only the category names are converted to Python lists; the
# supercategories are kept as returned by unique().
coco_label_val_ann = coco_label_val_ann.merge(label_df, on='category_id')
val_grouped_categories = coco_label_val_ann.groupby('image_id')['name'].unique().apply(list).reset_index()
val_grouped_supercategories = coco_label_val_ann.groupby('image_id')['supercategory'].unique().reset_index()
val_categories = val_grouped_categories.merge(val_grouped_supercategories, on='image_id')
# Tag the rows with their split so they can later be matched on set_source
val_categories['set_source'] = 'val2014'

# Training split: same processing as for the validation split above
coco_label_train_ann = coco_label_train_ann.merge(label_df, on='category_id')
train_grouped_categories = coco_label_train_ann.groupby('image_id')['name'].unique().apply(list).reset_index()
train_grouped_supercategories = coco_label_train_ann.groupby('image_id')['supercategory'].unique().reset_index()
train_categories = train_grouped_categories.merge(train_grouped_supercategories, on='image_id')
train_categories['set_source'] = 'train2014'

# Stack the per-image category tables of both splits into one table
overall_categories = pd.concat([val_categories, 
                                train_categories], 
                               ignore_index=True)

# Use plural column names, since each cell holds several values per image
overall_categories = overall_categories.rename(columns={'name': 'categories', 
                                                        'supercategory': 'supercategories'})

# Merge the categories with the coco_mmd and copy the images
# image_idx is the bare numeric COCO image number parsed from the file name;
# the annotation table's image_id is renamed to image_idx so both frames can
# be joined on (image_idx, set_source).
coco_mmd['image_idx'] = coco_mmd['img_file'].apply(create_img_index)
overall_categories = overall_categories.rename(columns={'image_id': 'image_idx'})
# merge() is called without `how`, so pandas' default inner join applies:
# dialogues whose image has no row in overall_categories are dropped.
coco_mmd = coco_mmd.merge(overall_categories, on=['image_idx', 'set_source'])
# Copy the image for every row and record the True/False result. Rows that
# share an image trigger the copy again for the same target file.
coco_mmd['copy_status'] = coco_mmd.apply(process_and_copy_image, axis=1)

def extract_dialogue(row, diglog):
    """Return the utterances around ``row['replaced_idx']`` joined by newlines.

    The window covers the utterance at ``replaced_idx`` plus one utterance
    before and one after it, clipped to the bounds of ``diglog``.

    Args:
        row: Mapping-like record providing ``replaced_idx``.
        diglog: Sequence of utterance strings.

    Returns:
        The selected utterances joined with ``"\\n"``.
    """
    center = row['replaced_idx']
    window = slice(max(0, center - 1), min(len(diglog), center + 2))
    return "\n".join(diglog[window])


# For every row, keep only the utterances around replaced_idx (see
# extract_dialogue) as a newline-joined string.
coco_mmd['truncated_dialogue'] = coco_mmd.apply(lambda row: extract_dialogue(row, row['dialog']), axis=1)
# Write the final table as a JSON array with one object per row.
coco_mmd.to_json(TARGET_DIR + '/sample.json', orient='records')

In [33]:
coco_mmd

Unnamed: 0,dialog,replaced_idx,score,dialog_dataset,dialog_file,img_dataset,img_file,set_source,image_id,image_idx,categories,supercategories,copy_status,truncated_dialogue
0,"[hello , how are you tonight ?, i am tired . i...",4,0.580191,persona,chat10005.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]",True,i played with my toys in my room .\ndo you lik...
1,"[good morning how old are you i am 34, i am mu...",2,0.556051,persona,chat10043.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]",True,i am much older than that . my grandchildren a...
2,"[hello , how are you this evening ?, hello how...",4,0.553968,persona,chat10044.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]",True,"i am chillin , do you have any pets ?\ni have ..."
3,"[hello , how are you this morning ?, not good ...",12,0.575316,persona,chat10050.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]",True,i used to ride bikes with my kids alot . do yo...
4,"[hey how are you today ?, i am great . you hav...",3,0.601362,persona,chat10061.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]",True,"horses on the farm and you\nthree dogs , i hav..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16975,"[My minibar is completely empty ., Everything ...",4,0.585859,dailydialog,train9890.txt,coco,COCO_val2014_000000298290.jpg,val2014,2000000298290,298290,"[refrigerator, bottle]","[appliance, kitchen]",True,Is there anything special you'd like ?\nJust b...
16976,"[Well , here is your breakfast !, Thanks so mu...",4,0.568436,dailydialog,train9914.txt,coco,COCO_val2014_000000397041.jpg,val2014,2000000397041,397041,"[sandwich, dining table, fork, cup, spoon, per...","[food, furniture, kitchen, person]",True,"Oh yeah . Here , I will just trade with him .\..."
16977,"[Miss Liu , please come in ., What can I do fo...",2,0.562611,dailydialog,train9939.txt,coco,COCO_val2014_000000274398.jpg,val2014,2000000274398,274398,[knife],[kitchen],True,"What can I do for you , sir ?\nCopy this repor..."
16978,"[May , look , what's that in the cage ?, Don't...",8,0.556105,dailydialog,train995.txt,coco,COCO_val2014_000000439326.jpg,val2014,2000000439326,439326,"[bear, cake]","[animal, food]",True,"You sound quite knowledgeable .\nNot really , ..."
