In [1]:
import json
import pandas as pd
import requests
import zipfile
import numpy as np

In [2]:
url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"

# Download the file from `url` and save it locally under `file_name` using requests.
file_name = "annotations_trainval2017.zip"

# The request is made (and its status checked) BEFORE the destination file is
# opened, so a failed request no longer leaves an empty or HTML-error "zip"
# behind. The body is streamed in chunks instead of being held in memory as a
# whole, and a timeout prevents the cell from hanging on a stalled connection.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(file_name, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

In [3]:
# Unpack every member of the downloaded archive into the current working directory.
with zipfile.ZipFile(file_name, mode="r") as archive:
    archive.extractall(path=".")

In [4]:
# The 20 class names this notebook keeps. They are used both to filter the
# category table by name and as the literal text searched for in captions.
# (Presumably the Pascal VOC label set, written with the spellings the
# notebook matches against -- confirm if the exact VOC names matter.)
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "dining table", "dog", "horse", "motorbike", "person",
    "potted plant", "sheep", "sofa", "tv", "train",
]

In [5]:
# Load the caption and instance annotation files extracted from the archive.
# Context managers guarantee the file handles are closed once parsing is done
# (the previous `json.load(open(...))` form left them to the garbage collector),
# and the explicit encoding keeps the read independent of the platform default.
with open('annotations/captions_val2017.json', 'r', encoding="utf-8") as captions_file:
    captions = json.load(captions_file)
with open('annotations/instances_val2017.json', 'r', encoding="utf-8") as instances_file:
    annotations = json.load(instances_file)

In [6]:
# Three category names in the annotation file are spelled differently from
# the entries of voc_classes; rename them so the name-based filter can match:
#   motorcycle -> motorbike
#   airplane   -> aeroplane
#   couch      -> sofa
classes_mapping = {
    "motorcycle": "motorbike",
    "airplane": "aeroplane",
    "couch": "sofa",
}
df_categories = pd.DataFrame(annotations["categories"])
df_categories["name"] = df_categories["name"].replace(classes_mapping)
df_categories_voc = df_categories[df_categories["name"].isin(voc_classes)]
# Sanity check: exactly 20 categories must survive the filter.
assert len(df_categories_voc) == 20

In [None]:
# Keep only non-crowd annotations whose category is one of the selected ones.
df_annotations = pd.DataFrame(annotations["annotations"])
df_annotations = df_annotations[df_annotations["iscrowd"] == 0]
df_annotations = df_annotations[
    df_annotations["category_id"].isin(df_categories_voc["id"])
]

# Images that have at least one remaining annotation with area above 5000.
ids_filtrados = df_annotations.loc[df_annotations["area"] > 5000, "image_id"].unique()

# For those images, count the non-null segmentation entries per image.
df_counts = (
    df_annotations[df_annotations["image_id"].isin(ids_filtrados)]
    .groupby("image_id")["segmentation"]
    .count()
    .reset_index()
)

# NOTE: despite the variable name, this keeps images with one OR two annotations.
ids_1_annotation = df_counts[df_counts["segmentation"] <= 2]
df_counts

In [None]:

# Start from the captions of the images selected in ids_1_annotation.
df_captions = pd.DataFrame(captions["annotations"])
df_captions = df_captions[df_captions["image_id"].isin(ids_1_annotation["image_id"])]

# Pair each caption with every object annotation of the same image, keep the
# columns of interest, and make the caption identifier explicit.
df_captions = (
    df_captions.merge(
        df_annotations.rename(columns={"id": "annotation_id"}),
        on="image_id",
        how="inner",
    )[["id", "image_id", "category_id", "area", "caption"]]
    .rename(columns={"id": "caption_id"})
)

# Attach the category name; the category table's own "id" column duplicates
# category_id after the merge and is dropped again.
df_captions = df_captions[df_captions["category_id"].isin(df_categories_voc["id"])]
df_captions = df_captions.merge(
    df_categories_voc[["id", "name"]],
    left_on="category_id",
    right_on="id",
    how="inner",
).drop(columns=["id"])

# Attach the image URL.
df_images = pd.DataFrame(captions["images"])[["id", "coco_url"]]
df_captions = df_captions.merge(
    df_images.rename(columns={"id": "image_id"}),
    on="image_id",
    how="inner",
)
df_captions = df_captions.rename(columns={"coco_url": "url", "name": "category_name"})

# One row per (caption, category) pair, even if the image holds two objects
# of the same category.
df_captions = df_captions.drop_duplicates(subset=["caption_id", "category_id"])
df_captions

In [None]:
# Normalise the caption text: lower-case it, turn newlines into spaces, then
# trim surrounding whitespace, commas and periods (in that order).
df_captions["caption"] = (
    df_captions["caption"]
    .str.lower()
    .str.replace("\n", " ", regex=False)
    .str.strip()
    .str.strip(",")
    .str.strip(".")
)

# Replace synonyms with the spelling used in voc_classes. Every pattern is
# literal text, so regex=False is passed explicitly: the default of
# Series.str.replace differs between pandas major versions, and being
# explicit keeps the result (and the absence of warnings) version-independent.
# "couch" is only replaced when it has a space on at least one side.
for coco_word, voc_word in (
    ("motorcycle", "motorbike"),
    ("airplane", "aeroplane"),
    ("couch ", "sofa "),
    (" couch", " sofa"),
):
    df_captions["caption"] = df_captions["caption"].str.replace(
        coco_word, voc_word, regex=False
    )

# Flag whether the caption literally contains the name of its category.
# NOTE: this is a plain substring test, so e.g. "car" also matches "scarf";
# kept as-is because downstream files depend on these semantics.
df_captions["word_included"] = False

for category in voc_classes:
    is_category = df_captions["category_name"] == category
    if not is_category.any():
        # Nothing to flag for a class that has no captions.
        continue
    df_captions.loc[is_category, "word_included"] = df_captions.loc[
        is_category, "caption"
    ].str.contains(category, regex=False)

# Not strictly needed (the frame is expected to be unique on this pair
# already), but the aggregation below relies on it.
df_captions = df_captions.drop_duplicates(subset=["caption_id", "category_id"])

# For captions that appear under several categories, build a "-"-joined list
# of all their category names and attach it to every row of that caption.
df_multiple = (
    df_captions.groupby("caption_id")
    .aggregate({"category_name": lambda x: "-".join(x)})
    .reset_index()
    .rename(columns={"category_name": "categories"})
)
df_captions = df_captions.merge(df_multiple, on="caption_id", how="inner")

# Final column order; "area" is intentionally left out by this selection.
df_captions = df_captions[
    [
        "caption_id",
        "caption",
        "category_name",
        "category_id",
        "categories",
        "word_included",
        "image_id",
        "url",
    ]
]
df_captions.to_csv("coco_captions_full.csv", index=False)
df_captions

In [16]:
def stratify_sample(df, n_samples, seed=42):
    """Sample up to ``n_samples`` rows for each distinct ``category_id``.

    Categories are visited in order of first appearance in ``df``. A category
    with fewer than ``n_samples`` rows is kept whole (in its original order);
    otherwise ``n_samples`` rows are drawn without replacement. A single
    ``numpy.random.RandomState`` seeded with ``seed`` is shared across all
    categories, so the result is reproducible for a given input.

    Args:
        df: DataFrame with a ``category_id`` column.
        n_samples: Maximum number of rows to keep per category.
        seed: Seed for the random number generator.

    Returns:
        A new DataFrame holding the selected rows, grouped by category in
        order of first appearance, with the original index labels and columns.
        An empty frame with the same columns is returned when nothing is
        selected.
    """
    random_state = np.random.RandomState(seed)
    pieces = []
    for category_id in df.category_id.unique():
        df_category = df[df.category_id == category_id]
        if len(df_category) >= n_samples:
            df_category = df_category.sample(n_samples, random_state=random_state)
        # Skip empty pieces (e.g. a NaN category never compares equal to itself).
        if not df_category.empty:
            pieces.append(df_category)

    if not pieces:
        return df.iloc[0:0].copy()

    # Concatenate the sampled rows themselves instead of re-selecting them by
    # label: with a non-unique index, ``df.loc[labels]`` would return every
    # row sharing a label and silently exceed ``n_samples``.
    return pd.concat(pieces)
    

# Draw up to 30 rows per category from each side of the word_included flag.
df_not_included = stratify_sample(
    df_captions[df_captions["word_included"].eq(False)],
    n_samples=30,
).drop_duplicates(subset=["caption_id"])
df_included = stratify_sample(
    df_captions[df_captions["word_included"].eq(True)],
    n_samples=30,
)

# Included rows are listed first, so they are the ones kept when the same
# caption_id shows up in both samples.
df_sampled = pd.concat([df_included, df_not_included])
df_sampled = df_sampled.drop_duplicates(subset=["caption_id"])
df_sampled = df_sampled.drop(columns=["category_id"])
df_sampled.to_csv("coco_captions_sampled.csv", index=False)
df_sampled

Unnamed: 0,caption_id,caption,category_name,categories,word_included,image_id,url
74,404610,a close up of a motorbike parked on a dirt road,motorbike,motorbike,True,410878,http://images.cocodataset.org/val2017/00000041...
4,6803,a motorbike with its brake extended standing o...,motorbike,motorbike,True,179765,http://images.cocodataset.org/val2017/00000017...
84,301932,a motorbike is parked in a grassy field,motorbike,motorbike,True,455716,http://images.cocodataset.org/val2017/00000045...
72,273132,a motorbike is parked on a gravel road,motorbike,motorbike,True,410878,http://images.cocodataset.org/val2017/00000041...
12,82346,a black cat with arched back walking past a mo...,motorbike,motorbike-cat,True,153217,http://images.cocodataset.org/val2017/00000015...
...,...,...,...,...,...,...,...
9234,531481,a ferry in a port with mountains in the backgr...,boat,boat,False,267940,http://images.cocodataset.org/val2017/00000026...
9235,538108,a ship anchored to the bay and close to some b...,boat,boat,False,267940,http://images.cocodataset.org/val2017/00000026...
9237,538603,a small ferry docked on calm water in norway,boat,boat,False,267940,http://images.cocodataset.org/val2017/00000026...
9240,575551,a small green boa is on a lake,boat,boat,False,311518,http://images.cocodataset.org/val2017/00000031...
