In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json 
import sys 
import re
import shutil
import json 

# Put the parent of the working directory on the import path and locate the
# 'dataset' folder that lives inside that parent directory.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.insert(0, project_root)
dataset_dir = os.path.join(project_root, 'dataset')

# Create the dataset folder on first use; otherwise only report that it exists.
if os.path.exists(dataset_dir):
    print('dataset directory already exists.')
else:
    os.makedirs(dataset_dir)

# This notebook is for creating the real OOD MMD dataset

In [2]:
# Input locations (COCO 2014 images/annotations, VisDial dialogs) and output
# locations for the generated QA dataset, all rooted at dataset_dir.
COCO2014_VAL_DIR = f'{dataset_dir}/coco/val2014'
COCO2014_TRAIN_DIR = f'{dataset_dir}/coco/train2014'
# NOTE(review): 'coo_annotations' looks like a typo for 'coco_annotations';
# left unchanged because the on-disk layout is not visible here -- confirm.
COCO2014_ANNOTATIONS_DIR = f'{dataset_dir}/coco/coo_annotations'
VISDIAL_DIR = f'{dataset_dir}/visdial'
VISDIAL_ANNOTATIONS_DIR = f'{dataset_dir}/coco_annotations'
TARGET_DIR = f'{dataset_dir}/qa'
TARGET_IMAGE_DIR = f'{dataset_dir}/qa/images'

# Create each output directory independently. Previously the image directory
# was only created when TARGET_DIR itself was missing, so an existing 'qa'
# folder without 'images' made every later image copy fail.
os.makedirs(TARGET_DIR, exist_ok=True)
os.makedirs(TARGET_IMAGE_DIR, exist_ok=True)
    

def process_and_copy_image(row):
    """Copy one COCO image into TARGET_IMAGE_DIR under the name '<image_id>.jpg'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'set_source' ('val2014' or 'train2014'), 'image_id' and
            'file_name'.

    Returns:
        True when the file was copied; False when 'set_source' is not one of
        the two known splits or when the copy raised (the error is printed
        rather than propagated, so one bad row does not abort the whole run).
    """
    origin = row['set_source']
    if origin not in ('val2014', 'train2014'):
        print(f"Unknown image source: {row['set_source']}, {row['image_id']}")
        return False

    source_dir = COCO2014_VAL_DIR if origin == 'val2014' else COCO2014_TRAIN_DIR

    try:
        # Destination name is derived from the image id, not the COCO file name.
        renamed = f"{row['image_id']}.jpg"
        src = os.path.join(source_dir, row['file_name'])
        dst = os.path.join(TARGET_IMAGE_DIR, renamed)
        shutil.copy(src, dst)
    except Exception as e:
        print(e)
        return False
    return True

In [7]:
# Paths of the VisDial v0.9 dialog files.
train_dialog = f'{VISDIAL_DIR}/visdial_0.9_train.json'
val_dialog = f'{VISDIAL_DIR}/visdial_0.9_val.json'

# COCO instance annotations. Context managers close the file handles as soon
# as parsing finishes, and the encoding is pinned so the result does not
# depend on the platform default.
with open(COCO2014_ANNOTATIONS_DIR + '/instances_val2014.json', 'r', encoding='utf-8') as f:
    coco_label_val = json.load(f)
with open(COCO2014_ANNOTATIONS_DIR + '/instances_train2014.json', 'r', encoding='utf-8') as f:
    coco_label_train = json.load(f)

with open(train_dialog, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(val_dialog, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

# Question/answer lookup tables of each split; the dialog records are loaded
# into one DataFrame per split.
train_question_list = train_data['data']['questions']
val_question_list = val_data['data']['questions']
train_answer_list = train_data['data']['answers']
val_answer_list = val_data['data']['answers']

train_df = pd.DataFrame(train_data['data']['dialogs'])
val_df = pd.DataFrame(val_data['data']['dialogs'])

def generate_train_diag(row):
    """Render a training dialog as newline-terminated "Q: ..." / "A: ..." lines.

    Each turn in row['dialog'] carries a 'question' and an 'answer' entry that
    are used to look up the text in train_question_list / train_answer_list.

    Returns:
        The concatenated text of all turns ("" when the dialog has no turns).
    """
    return "".join(
        "Q: " + train_question_list[turn['question']] + '\n'
        + "A: " + train_answer_list[turn['answer']] + '\n'
        for turn in row['dialog']
    )

def generate_val_diag(row):
    """Render a validation dialog as newline-terminated "Q: ..." / "A: ..." lines.

    Each turn in row['dialog'] carries a 'question' and an 'answer' entry that
    are used to look up the text in val_question_list / val_answer_list.

    Returns:
        The concatenated text of all turns ("" when the dialog has no turns).
    """
    return "".join(
        "Q: " + val_question_list[turn['question']] + '\n'
        + "A: " + val_answer_list[turn['answer']] + '\n'
        for turn in row['dialog']
    )

# Render every dialog into a single text column. This was previously computed
# twice in a row with identical results; one row-wise pass per split suffices.
train_df['dialog_full'] = train_df.apply(generate_train_diag, axis=1)
val_df['dialog_full'] = val_df.apply(generate_val_diag, axis=1)

# Build the COCO file name from image_id (zero-padded to 12 digits).
train_df['file_name'] = train_df['image_id'].apply(lambda x: f'COCO_train2014_{str(x).zfill(12)}.jpg')
val_df['file_name'] = val_df['image_id'].apply(lambda x: f'COCO_val2014_{str(x).zfill(12)}.jpg')

coco_label_val_ann = pd.DataFrame(coco_label_val['annotations'])
coco_label_train_ann = pd.DataFrame(coco_label_train['annotations'])

# Category table (id -> name / supercategory). It is taken from the val
# annotations and reused for train.
# NOTE(review): assumes both splits share the same category list -- confirm.
label_df = pd.DataFrame(coco_label_val['categories'])
label_df = label_df.rename(columns={'id': 'category_id'})

# Val split: attach labels to annotations, then collect the distinct category
# names and supercategories present in each image.
coco_label_val_ann = coco_label_val_ann.merge(label_df, on='category_id')
val_grouped_categories = coco_label_val_ann.groupby('image_id')['name'].unique().apply(list).reset_index()
val_grouped_supercategories = coco_label_val_ann.groupby('image_id')['supercategory'].unique().reset_index()
val_categories = val_grouped_categories.merge(val_grouped_supercategories, on='image_id')
val_categories['set_source'] = 'val2014'

# Train split: same aggregation.
coco_label_train_ann = coco_label_train_ann.merge(label_df, on='category_id')
train_grouped_categories = coco_label_train_ann.groupby('image_id')['name'].unique().apply(list).reset_index()
train_grouped_supercategories = coco_label_train_ann.groupby('image_id')['supercategory'].unique().reset_index()
train_categories = train_grouped_categories.merge(train_grouped_supercategories, on='image_id')
train_categories['set_source'] = 'train2014'

# Left merge keeps every dialog; images without any annotation end up with a
# null set_source and are filtered out below.
train_df = train_df.merge(train_categories, on='image_id', how='left')
val_df = val_df.merge(val_categories, on='image_id', how='left')

overall_df = pd.concat([train_df, val_df], axis=0)
overall_df = overall_df[overall_df['set_source'].notnull()]

# The JSON is written before copy_status is added, so the exported records do
# not contain that column.
overall_df.to_json(TARGET_DIR + '/sample.json', orient='records')
overall_df['copy_status'] = overall_df.apply(process_and_copy_image, axis=1)