In [3]:
import os
import json

import pandas as pd
import numpy as np

### Adjust to your environment: root directory that holds the
### `shutterstock/` and `nice/` dataset folders read by this notebook.
data_root = "../../../../../datasets"
####################################

In [4]:
# Load the raw Shutterstock dump (tab-separated). Column names are supplied
# here, so the first line of the file is read as data, not as a header.
df = pd.read_csv(
    os.path.join(data_root, "shutterstock", "shutterstock.csv"),
    sep="\t",
    names=["url", "caption"],
)
df.head()

Unnamed: 0,url,caption
0,https://image.shutterstock.com/image-vector/ha...,half filled goblet with juice\t
1,https://image.shutterstock.com/image-illustrat...,Simple Boy and Girl. Icon. Imitation draw wit...
2,https://image.shutterstock.com/image-photo/sil...,Silhouette of airplane in sky. Mixed media . M...
3,https://image.shutterstock.com/image-illustrat...,to paint a wall
4,https://image.shutterstock.com/image-vector/ab...,Abstract seamless pattern painted in colors of...


In [5]:
# Collect one record per photo URL whose numeric image id lies inside the
# hard-coded id window below.
all_data = []

for row_no, (url, caption) in enumerate(zip(df["url"].values, df["caption"].values), start=1):
    # Progress message once every million rows.
    if row_no % 1000000 == 0:
        print(f"Processing... {row_no}/{len(df)}")

    # First path segment after the host, e.g. "image-photo" or "image-vector".
    image_type = url.split(".com/")[1].split("/")[0]
    # Keep only segments that contain both "image" and "photo".
    if not ("image" in image_type and "photo" in image_type):
        continue

    # The id is the number between the last "-" in the URL and ".jpg".
    image_id = int(url.split("-")[-1].split(".jpg")[0])
    # NOTE(review): the window bounds are magic numbers; presumably a coarse
    # bracket around the validation-set ids -- confirm.
    if 215268000 <= image_id <= 1908117000:
        # str() also turns a missing caption into text; rstrip() drops trailing
        # whitespace such as a stray tab.
        all_data.append({"url": url, "image_id": image_id, "caption": str(caption).rstrip()})

Processing... 1000000/15001000
Processing... 2000000/15001000
Processing... 3000000/15001000
Processing... 4000000/15001000
Processing... 5000000/15001000
Processing... 6000000/15001000
Processing... 7000000/15001000
Processing... 8000000/15001000
Processing... 9000000/15001000
Processing... 10000000/15001000
Processing... 11000000/15001000
Processing... 12000000/15001000
Processing... 13000000/15001000
Processing... 14000000/15001000
Processing... 15000000/15001000


In [6]:
# Number of photo records that survived the image-type and id-window filter.
len(all_data)

6784940

In [7]:
# Image ids of the retained records, in the same order as `all_data`.
all_ids = [record["image_id"] for record in all_data]

In [8]:
# Load the NICE validation annotations. A context manager is used so the file
# handle is closed as soon as parsing finishes (the bare open() call leaked it).
with open(os.path.join(data_root, "nice", "nice_val.json"), "r") as f:
    nice_val = json.load(f)
# Each entry's "image" value is a path whose file name is "<id>.jpg"; keep the
# numeric id.
val_ids = [int(im["image"].split("/")[-1].split(".jpg")[0]) for im in nice_val]

In [9]:
# Histogram of the validation image ids over 200 equal-width bins spanning
# their min..max; `bins` holds the 201 bin edges.
hist, bins = np.histogram(val_ids, bins=200)

In [10]:
# Count the candidate ids using the validation bin edges. Ids outside
# [bins[0], bins[-1]] are not counted by np.histogram.
all_hist, all_bins = np.histogram(all_ids, bins=bins)

In [11]:
# Bins that contain no validation id ...
drop_inds = np.where(hist == 0)[0]
# ... contribute nothing: zero the candidate counts there (in-place edit of all_hist).
all_hist[drop_inds] = 0

In [12]:
# Number of candidate ids that fall in bins populated by the validation set.
all_hist.sum()

1105945

In [13]:
# Indices of the bins that contain at least one validation id.
hist_inds = np.where(hist > 0)[0]

In [14]:
# One (left edge, right edge) pair per validation-populated bin; int()
# truncates the bin edges to whole ids.
intervals = [(int(bins[k]), int(bins[k + 1])) for k in hist_inds]

In [15]:
# Array form of the candidate ids so range masks can be computed vectorised.
all_ids_np = np.asarray(all_ids)

In [16]:
# Gather every candidate id that lies in one of the half-open intervals
# [left, right). Despite its name, `filter_inds` holds image ids, not positions.
filter_inds = []
for left, right in intervals:
    in_range = (all_ids_np >= left) & (all_ids_np < right)
    subset = all_ids_np[in_range]
    filter_inds.extend(subset.tolist())
    print(f"left: {left}, right: {right}, len: {len(subset)}")

left: 215268662, right: 223732901, len: 40560
left: 401481938, right: 409946178, len: 39191
left: 1256370162, right: 1264834402, len: 43412
left: 1273298642, right: 1281762881, len: 40898
left: 1290227121, right: 1298691361, len: 41220
left: 1298691361, right: 1307155601, len: 41767
left: 1476440398, right: 1484904638, len: 33722
left: 1493368877, right: 1501833117, len: 33791
left: 1527225837, right: 1535690077, len: 30797
left: 1552618556, right: 1561082796, len: 32403
left: 1561082796, right: 1569547036, len: 32959
left: 1569547036, right: 1578011276, len: 32392
left: 1578011276, right: 1586475516, len: 31642
left: 1586475516, right: 1594939755, len: 30889
left: 1637260955, right: 1645725194, len: 30123
left: 1645725194, right: 1654189434, len: 29788
left: 1654189434, right: 1662653674, len: 30358
left: 1662653674, right: 1671117914, len: 29973
left: 1671117914, right: 1679582154, len: 30401
left: 1679582154, right: 1688046394, len: 29879
left: 1704974873, right: 1713439113, len: 30

In [17]:
# A set gives O(1) membership checks against the selected image ids.
filter_inds_set = set(filter_inds)
# Keep, in original order, the records whose id was selected.
new_all_data = [record for record in all_data if record["image_id"] in filter_inds_set]

In [18]:
# Number of records kept after restricting to validation-populated id intervals.
len(new_all_data)

1105945

In [19]:
# Tabular view of the filtered records (columns: url, image_id, caption).
new_df = pd.DataFrame(new_all_data)
new_df.head()

Unnamed: 0,url,image_id,caption
0,https://image.shutterstock.com/image-photo/mul...,1300888363,Mule Deer with antlers in Palo Duro Canyon Sta...
1,https://image.shutterstock.com/image-photo/pot...,1861243240,"Potato Pancakes Apple Puree, Nuts and Cinnamon..."
2,https://image.shutterstock.com/image-photo/blu...,1307047705,Blue Sky Trees
3,https://image.shutterstock.com/image-photo/jeo...,1257901573,"JEONJU, KOREA - NOV, 15, 2018: Beautiful Jeonj..."
4,https://image.shutterstock.com/image-photo/tha...,1855531294,Thanksgiving background with decorative pumpki...


In [20]:
# Write the filtered table as a tab-separated file. With the default
# index=True the row index is written as an unnamed first column.
# NOTE(review): presumably the input consumed by download_shutterstock.py -- confirm.
new_df.to_csv(os.path.join(data_root, "shutterstock", "shutterstock_filtered.csv"), sep="\t")

### Annotations

- Run `download_shutterstock.py` as follows:
```
python nice/datasets/download_scripts/DownloadShutterstockCaptions/download_shutterstock.py --data_path ${DATA_PATH}
```

- This produces `downloaded_shutterstock_report.tsv.gz` in the `${DATA_PATH}/shutterstock` directory, along with the corresponding images.

In [21]:
# Path of the gzipped, tab-separated download report (a file, despite the
# `_dir` suffix in the variable name).
report_dir = os.path.join(
    data_root, "shutterstock", "downloaded_shutterstock_report.tsv.gz"
)
# header=None: every line is read as data and columns are addressed by
# position.
report = pd.read_csv(
    report_dir,
    sep="\t",
    header=None,
)
report.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Mule Deer with antlers in Palo Duro Canyon Sta...,../datasets/shutterstock/images/1300888363.jpg,shutterstock,1300888363,image/jpeg,44588.0,200,https://image.shutterstock.com/image-photo/mul...
1,"Potato Pancakes Apple Puree, Nuts and Cinnamon...",../datasets/shutterstock/images/1861243240.jpg,shutterstock,1861243240,image/jpeg,30718.0,200,https://image.shutterstock.com/image-photo/pot...
2,Blue Sky Trees,../datasets/shutterstock/images/1307047705.jpg,shutterstock,1307047705,image/jpeg,20551.0,200,https://image.shutterstock.com/image-photo/blu...
3,"JEONJU, KOREA - NOV, 15, 2018: Beautiful Jeonj...",../datasets/shutterstock/images/1257901573.jpg,shutterstock,1257901573,image/jpeg,53134.0,200,https://image.shutterstock.com/image-photo/jeo...
4,Thanksgiving background with decorative pumpki...,../datasets/shutterstock/images/1855531294.jpg,shutterstock,1855531294,image/jpeg,26919.0,200,https://image.shutterstock.com/image-photo/tha...


In [22]:
new_anns = []
for i, item in report.iterrows():
    if (i+1) % 100000 == 0:
        print(f"{i+1}/{len(report)}")
    if item[6] != 200:  # exclude samples failed to download
        continue
    image_id = int(item[3])
    img_path = os.path.join("images", f"{image_id}.jpg")
    ann = {"caption": item[0], "image": img_path}
    new_anns.append(ann)


100000/1105945
200000/1105945
300000/1105945
400000/1105945
500000/1105945
600000/1105945
700000/1105945
800000/1105945
900000/1105945
1000000/1105945
1100000/1105945


In [23]:
# Compare the number of usable annotations with the number of requested downloads.
print("Successfully downloaded {} shutterstock samples out of {}".format(len(new_anns), len(new_df)))

Successfully downloaded 1090070 shutterstock samples out of 1105945


In [24]:
# Save the annotations (a list of {"caption", "image"} dicts) as a single JSON file.
with open(os.path.join(data_root, "shutterstock", "shutterstock_1m.json"), "w") as f:
    json.dump(new_anns, f)