In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json 
import sys 

# Put the parent of the current working directory at the front of the module
# search path so that imports resolve against it first.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [3]:
# COCO 2014 image folders; images are copied out of these by process_and_copy_image.
COCO2014_VAL_DIR = 'dataset/coco/val2014'
COCO2014_TRAIN_DIR = 'dataset/coco/train2014'
# Folder holding instances_val2014.json / instances_train2014.json.
# NOTE(review): 'coo_annotations' looks like a typo for 'coco_annotations' —
# confirm against the directory name on disk before changing it.
COCO2014_ANNOTATIONS_DIR = 'dataset/coco/coo_annotations'
# MMD splits; each folder is read for its split file (dev.json / train.json / test.json).
MMD_VAL_DIR = 'dataset/mmd/dev'
MMD_TRAIN_DIR = 'dataset/mmd/train'
MMD_TEST_DIR = 'dataset/mmd/test'
# Output folder: destination for copied images and for coco_mmd.json.
TARGET_DIR = 'dataset/mmd/sample'

In [15]:
# Read the dev / train / test split files, in that order, into one frame each.
val_json, train_json, test_json = (
    pd.read_json(split_dir + json_name)
    for split_dir, json_name in (
        (MMD_VAL_DIR, '/dev.json'),
        (MMD_TRAIN_DIR, '/train.json'),
        (MMD_TEST_DIR, '/test.json'),
    )
)

In [17]:
# Stack the three splits row-wise and renumber the index from 0.
total_json = pd.concat(
    (val_json, train_json, test_json),
    axis=0,
    ignore_index=True,
)

In [25]:
# Keep only the rows whose image comes from COCO; .copy() gives an independent
# frame so that columns can be added later without SettingWithCopyWarning.
is_coco_row = total_json['img_dataset'].eq('coco')
coco_mmd = total_json.loc[is_coco_row].copy()

In [27]:
import re

def extract_pattern(s):
    """Return the text between ``COCO_`` and the next ``_<digits>`` in ``s``.

    For ``COCO_val2014_000000283210.jpg`` this is ``val2014``. Returns ``None``
    when the string does not contain the pattern.
    """
    found = re.search(r'COCO_(.*?)_\d+', s)
    # group(1) is the content of the first (and only) capture group.
    return found.group(1) if found else None


# Derive the COCO split name (e.g. 'val2014') from every image file name.
coco_mmd['set_source'] = coco_mmd['img_file'].map(extract_pattern)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coco_mmd['set_source'] = coco_mmd['img_file'].apply(extract_pattern)


In [31]:
# Remove the two index columns that are not carried forward.
# Note: re-running this cell raises KeyError because the columns are already gone.
coco_mmd = coco_mmd.drop(['replaced_idx', 'img_idx'], axis='columns')

In [33]:
def create_complex_index(s):
    """Build a numeric image id that also encodes the COCO split.

    Args:
        s: Image file name shaped like ``COCO_<split>_<digits>.<ext>``,
            e.g. ``COCO_val2014_000000283210.jpg``.

    Returns:
        int: A split digit (1 when the split field contains ``train``, 2 when
        it contains ``val``, 3 otherwise) followed by the digits of the file
        name, e.g. ``2000000283210``.

    Raises:
        IndexError: If ``s`` has fewer than three ``_``-separated fields.
        ValueError: If the third field (minus its extension) is not numeric.
    """
    parts = s.split('_')
    split_name = parts[1]
    if 'train' in split_name:
        type_code = 1
    elif 'val' in split_name:
        type_code = 2
    else:
        type_code = 3
    # Strip the extension by name rather than with a fixed 4-character slice,
    # so extensions of other lengths (e.g. ``.jpeg``) or a missing extension
    # do not corrupt the id.
    id_str = os.path.splitext(parts[2])[0]
    return int(f"{type_code}{id_str}")

# Give every row a split-prefixed numeric id computed from its image file name.
coco_mmd['image_id'] = coco_mmd['img_file'].map(create_complex_index)


In [38]:
import os
import shutil

In [39]:
def process_and_copy_image(row):
    """Copy one COCO image into ``TARGET_DIR`` under the name ``<image_id>.jpg``.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``set_source``, ``image_id`` and ``img_file``.

    Raises:
        ValueError: If ``set_source`` is neither ``'val2014'`` nor
            ``'train2014'``.
    """
    set_source = row['set_source']
    if set_source == 'val2014':
        source_dir = COCO2014_VAL_DIR
    elif set_source == 'train2014':
        source_dir = COCO2014_TRAIN_DIR
    else:
        # Include the offending value so that bad rows can be traced.
        raise ValueError(f"Unknown set_source: {set_source!r}")

    source_path = os.path.join(source_dir, row['img_file'])
    target_path = os.path.join(TARGET_DIR, f"{row['image_id']}.jpg")

    # shutil.copy does not create missing directories; make sure the
    # destination folder exists so that the first copy does not fail.
    os.makedirs(TARGET_DIR, exist_ok=True)
    shutil.copy(source_path, target_path)

In [68]:
import json 


# Load the COCO 2014 instance annotations. The files are opened in `with`
# blocks so that each handle is closed as soon as its JSON has been parsed.
with open(COCO2014_ANNOTATIONS_DIR + '/instances_val2014.json') as val_file:
    coco_label_val = json.load(val_file)
with open(COCO2014_ANNOTATIONS_DIR + '/instances_train2014.json') as train_file:
    coco_label_train = json.load(train_file)

In [69]:
# One row per annotation for each split.
coco_label_val_ann = pd.DataFrame(coco_label_val['annotations'])
coco_label_train_ann = pd.DataFrame(coco_label_train['annotations'])
# Category lookup table, taken from the validation file only; its 'id' column
# is renamed so that it lines up with the annotations' 'category_id'.
label_df = pd.DataFrame(coco_label_val['categories']).rename(columns={'id': 'category_id'})

In [70]:
# Attach category name / supercategory to every validation annotation.
# Note: this reassigns coco_label_val_ann, so the cell is not safe to re-run
# without first rebuilding the annotation frame.
coco_label_val_ann = pd.merge(coco_label_val_ann, label_df, on='category_id')
# Distinct category names per image (stored as Python lists).
val_grouped_categories = (
    coco_label_val_ann
    .groupby('image_id')['name']
    .unique()
    .apply(list)
    .reset_index()
)
# Distinct supercategories per image (left as the arrays returned by unique()).
val_grouped_upercategories = (
    coco_label_val_ann
    .groupby('image_id')['supercategory']
    .unique()
    .reset_index()
)
val_categories = pd.merge(
    val_grouped_categories,
    val_grouped_upercategories,
    on='image_id',
).assign(set_source='val2014')

In [71]:
# Attach category name / supercategory to every training annotation.
# Note: this reassigns coco_label_train_ann, so the cell is not safe to re-run
# without first rebuilding the annotation frame.
coco_label_train_ann = pd.merge(coco_label_train_ann, label_df, on='category_id')
# Distinct category names per image (stored as Python lists).
train_grouped_categories = (
    coco_label_train_ann
    .groupby('image_id')['name']
    .unique()
    .apply(list)
    .reset_index()
)
# Distinct supercategories per image (left as the arrays returned by unique()).
train_grouped_upercategories = (
    coco_label_train_ann
    .groupby('image_id')['supercategory']
    .unique()
    .reset_index()
)
train_categories = pd.merge(
    train_grouped_categories,
    train_grouped_upercategories,
    on='image_id',
).assign(set_source='train2014')


In [73]:
# Combine the per-image category tables of both splits into one frame.
overall_categories = pd.concat(
    (val_categories, train_categories),
    axis=0,
    ignore_index=True,
)

In [75]:
# Use plural column names, since each cell holds a collection of labels.
overall_categories = overall_categories.rename(
    {'name': 'categories', 'supercategory': 'supercategories'},
    axis='columns',
)

In [78]:
def create_img_index(s):
    """Return the numeric image id embedded in a COCO image file name.

    Args:
        s: Image file name shaped like ``COCO_<split>_<digits>.<ext>``,
            e.g. ``COCO_val2014_000000283210.jpg``.

    Returns:
        int: The digits of the third ``_``-separated field with leading zeros
        dropped, e.g. ``283210``.

    Raises:
        IndexError: If ``s`` has fewer than three ``_``-separated fields.
        ValueError: If the third field (minus its extension) is not numeric.
    """
    numbered_part = s.split('_')[2]
    # Strip the extension by name rather than with a fixed 4-character slice,
    # so extensions of other lengths (e.g. ``.jpeg``) or a missing extension
    # do not corrupt the id.
    id_str = os.path.splitext(numbered_part)[0]
    return int(id_str)

# Plain numeric image id parsed from the file name; used below as a merge key.
coco_mmd['image_idx'] = coco_mmd['img_file'].map(create_img_index)

In [81]:
# Rename the annotation id column so that it matches coco_mmd's 'image_idx' key.
overall_categories = overall_categories.rename({'image_id': 'image_idx'}, axis='columns')

In [83]:
# Attach categories / supercategories to each dialog row by image id and split.
# This is an inner join (the pandas default): rows whose image has no entry
# in overall_categories are dropped.
coco_mmd = pd.merge(
    coco_mmd,
    overall_categories,
    how='inner',
    on=['image_idx', 'set_source'],
)

In [90]:
# Write the merged table as a JSON array with one object per row.
coco_mmd.to_json(path_or_buf=TARGET_DIR + '/coco_mmd.json', orient='records')

In [92]:
# Show the dialog stored in the first row as a quick sanity check.
coco_mmd['dialog'].iat[0]

['hello , how are you tonight ?',
 'i am tired . i played all day long .',
 'what did you play ? i walked dogs for nine miles total .',
 'i played with my toys in my room .',
 'do you like dogs ? i have three .',
 'yes . i like dogs . what kind are they ?',
 'i have a lab and a mixed breed .',
 'my best friend has a cat . she lives next door to me .',
 'i like cats . do you play with her a lot ?',
 'yes . we play with our dolls together . how old are you ?',
 'i am 26 . do you need a babysitter ?',
 'i am only four years old . i do not like my babysitter .',
 'tell your parents . you should always like the one who is watching you .',
 'she gets mad at me when i do not clean my room .']

In [91]:
# Display the merged frame; the rendering is truncated to the first and last rows.
coco_mmd

Unnamed: 0,dialog,score,dialog_dataset,dialog_file,img_dataset,img_file,set_source,image_id,image_idx,categories,supercategories
0,"[hello , how are you tonight ?, i am tired . i...",0.580191,persona,chat10005.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]"
1,"[good morning how old are you i am 34, i am mu...",0.556051,persona,chat10043.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]"
2,"[hello , how are you this evening ?, hello how...",0.553968,persona,chat10044.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]"
3,"[hello , how are you this morning ?, not good ...",0.575316,persona,chat10050.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]"
4,"[hey how are you today ?, i am great . you hav...",0.601362,persona,chat10061.txt,coco,COCO_val2014_000000283210.jpg,val2014,2000000283210,283210,"[dog, chair]","[animal, furniture]"
...,...,...,...,...,...,...,...,...,...,...,...
16975,"[My minibar is completely empty ., Everything ...",0.585859,dailydialog,train9890.txt,coco,COCO_val2014_000000298290.jpg,val2014,2000000298290,298290,"[refrigerator, bottle]","[appliance, kitchen]"
16976,"[Well , here is your breakfast !, Thanks so mu...",0.568436,dailydialog,train9914.txt,coco,COCO_val2014_000000397041.jpg,val2014,2000000397041,397041,"[sandwich, dining table, fork, cup, spoon, per...","[food, furniture, kitchen, person]"
16977,"[Miss Liu , please come in ., What can I do fo...",0.562611,dailydialog,train9939.txt,coco,COCO_val2014_000000274398.jpg,val2014,2000000274398,274398,[knife],[kitchen]
16978,"[May , look , what's that in the cage ?, Don't...",0.556105,dailydialog,train995.txt,coco,COCO_val2014_000000439326.jpg,val2014,2000000439326,439326,"[bear, cake]","[animal, food]"
