In [1]:
# Change the working directory to the parent folder; relative paths used by
# later cells (e.g. "dataset/...") are resolved from there.
%cd ..

/home/zarizky/projects/neural-autoregressive-object-co-occurrence


In [2]:
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

from utils.dataset import ObjectCooccurrenceCOCODataset

In [3]:
# Parse the train and val instance-annotation JSON files into Python objects.
with open("dataset/instances_train2017.json") as train_handle:
    data_train = json.loads(train_handle.read())

with open("dataset/instances_val2017.json") as valid_handle:
    data_valid = json.loads(valid_handle.read())

In [4]:
# Unpack the "annotations" and "categories" entries of each loaded split.
annotations_train, categories_train = (
    data_train["annotations"],
    data_train["categories"],
)
annotations_valid, categories_valid = (
    data_valid["annotations"],
    data_valid["categories"],
)

In [5]:
def _tally_categories(images, annotations):
    """Count, per image, how many annotations fall into each category slot.

    Every image in ``images`` starts with a zero vector of length 90; each
    annotation then increments slot ``category_id - 1`` of the vector that
    belongs to its ``image_id``.

    Returns a dict mapping image id -> length-90 count vector.
    """
    tallies = {image["id"]: np.zeros(90) for image in images}
    for entry in tqdm(annotations):
        # Category ids are 1-based here, array slots are 0-based.
        tallies[entry["image_id"]][entry["category_id"] - 1] += 1
    return tallies


cooccurences_train = _tally_categories(data_train["images"], annotations_train)
cooccurences_valid = _tally_categories(data_valid["images"], annotations_valid)

  0%|          | 0/860001 [00:00<?, ?it/s]

  0%|          | 0/36781 [00:00<?, ?it/s]

In [6]:
# Human-readable column label for every category id; ids in 1..90 that have no
# entry in categories_train fall back to the "[UNK]" placeholder.
labels = {a["id"]: f"[{a['name']}]-[{a['supercategory']}]" for a in categories_train}
columns = [labels.get(i, "[UNK]") for i in range(1, 90 + 1)]

# Stack the per-image count vectors into one int32 row per image (90 columns).
X_train = np.fromiter(cooccurences_train.values(), dtype=(np.int32, 90))
X_valid = np.fromiter(cooccurences_valid.values(), dtype=(np.int32, 90))

df_images_train = pd.DataFrame(data_train["images"])
df_images_valid = pd.DataFrame(data_valid["images"])


def _join_counts_with_images(df_images, counts, image_ids, column_labels):
    """Attach per-category counts to the image metadata rows.

    Parameters
    ----------
    df_images : pd.DataFrame
        Image metadata; must contain an ``id`` column.
    counts : np.ndarray
        One row of category counts per image, one column per label.
    image_ids : iterable
        Image id for each row of ``counts``, in the same order.
    column_labels : list of str
        Label for each column of ``counts``; "[UNK]" marks placeholder slots.

    Returns
    -------
    pd.DataFrame
        ``df_images`` merged with the counts on the image id, without the
        placeholder "[UNK]" columns and without the helper ``image_id`` key.
    """
    df_counts = pd.DataFrame(counts, columns=column_labels)
    # All placeholder columns share the same label, so one drop removes them
    # all; errors="ignore" keeps this working when every id has a category.
    df_counts = df_counts.drop(["[UNK]"], axis=1, errors="ignore")
    # Materialise the ids so a dict view can be assigned as a column.
    df_counts["image_id"] = list(image_ids)
    merged = df_images.merge(df_counts, left_on="id", right_on="image_id")
    # "image_id" duplicates "id" after the merge and is only a join key.
    return merged.drop(["image_id"], axis=1)


df_train = _join_counts_with_images(
    df_images_train, X_train, cooccurences_train.keys(), columns
)
# Previously the cleaned validation frame was bound to `df_test`, so the CSV
# written from `df_valid` still carried the redundant "image_id" column and
# did not match the train CSV schema. Both splits now share the same path.
df_valid = _join_counts_with_images(
    df_images_valid, X_valid, cooccurences_valid.keys(), columns
)
# Alias kept in case later cells refer to the old name.
df_test = df_valid

df_train.to_csv("dataset/coco2017-cooccurences-train.csv", index=False)
df_valid.to_csv("dataset/coco2017-cooccurences-valid.csv", index=False)