In [72]:
"""
Prepare a subset of the LAION dataset: extract valid images and save corresponding metadata
"""

In [None]:
import os
import pandas as pd
import subprocess
import glob
import datetime
import shutil

In [58]:
ENTITY_COUNT = 20000

FULL_LAION_PATH = "/Users/yavuz/data/part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet"
PREP_DATASET_PATH = f"/Users/yavuz/data/LAION-{ENTITY_COUNT}/"

if os.path.exists(PREP_DATASET_PATH):
    print(f"Warning: {PREP_DATASET_PATH} exists!")
else:
    os.makedirs(PREP_DATASET_PATH)

IMAGES_PATH = PREP_DATASET_PATH + "images"
URLS_PATH = PREP_DATASET_PATH + "urls.txt"
SUCCEEDED_URLS_PATH = PREP_DATASET_PATH + "succeeded-urls.txt"
DATA_PATH = PREP_DATASET_PATH + "metadata.parquet"



In [59]:
def read_safe_data(path: str, count:int) -> pd.DataFrame:
    """
    Return non-nsfw entries from the full LAION dataset.
    """
    print(f"Reading {count} items from full LAION dataset...")
    df = pd.read_parquet(path)[:count]
    
    nsfw_removed_data = df[df["NSFW"]=="UNLIKELY"]
    print("Size after removing NSFW:", len(nsfw_removed_data))
    
    clean_url_data = nsfw_removed_data[~nsfw_removed_data['URL'].str.contains(',')]
    print("Size after removing URLs with commas:", len(clean_url_data))

    return clean_url_data

In [60]:
data = read_safe_data(FULL_LAION_PATH, ENTITY_COUNT)
data

Reading 20000 items from full LAION dataset...
Size after removing NSFW: 18614
Size after removing URLs with commas: 18162


Unnamed: 0,SAMPLE_ID,URL,TEXT,HEIGHT,WIDTH,LICENSE,NSFW,similarity
1,1.060015e+12,https://thumbs.ebaystatic.com/images/g/DYEAAOS...,Silverline Air Framing Nailer 90mm 10 - 12 Gau...,225.0,225.0,?,UNLIKELY,0.312485
2,3.372497e+12,https://farm1.staticflickr.com/784/40182677504...,Anhui Mountains,800.0,514.0,?,UNLIKELY,0.316512
3,3.820200e+11,https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...,Acute pain in a woman knee,257.0,240.0,?,UNLIKELY,0.344278
5,2.179119e+12,https://i.pinimg.com/236x/03/38/05/0338055833e...,Essentials Barnwood 70-inch TV Media Stand,236.0,236.0,?,UNLIKELY,0.332799
6,1.476880e+11,http://d25hqtnqp5nl24.cloudfront.net/images/pr...,Actimel vanilla - 8x100g Brand Price Match - C...,140.0,140.0,?,UNLIKELY,0.401302
...,...,...,...,...,...,...,...,...
19995,1.530866e+12,https://www.digsdigs.com/photos/sweet-shabby-c...,Http Www Digsdigs Com 33 Sweet Shabby Chic Bed...,480.0,486.0,?,UNLIKELY,0.352294
19996,1.063230e+11,http://www2.hbdirect.com/coverm/90/2432290.jpg,Rusconi: Revolution *,170.0,170.0,?,UNLIKELY,0.371127
19997,4.247173e+12,https://goalstudio.com/web/product/medium/2020...,TOTTENHAM 940 BALL CAP - GOLD,1100.0,1460.0,?,UNLIKELY,0.388634
19998,1.505120e+11,https://thumbs.dreamstime.com/m/clown-toy-colo...,Clown toy color vector illustration Royalty Fr...,92.0,130.0,?,UNLIKELY,0.349995


In [61]:
def write_urls(data: pd.DataFrame, path: str) -> None:
    """
    Writes the URLs found in the dataframe to a file in the given path
    """
    with open(path, "w+") as f:
        for url in data["URL"]:
            f.write(url + "\n")
    print(f"Finished writing {len(data)} URLs to {path}")

write_urls(data, URLS_PATH)

Finished writing 18162 URLs to /Users/yavuz/data/LAION-20000/urls.txt


In [62]:
def download_images(url_path: str, images_path: str):
    """
    download images from text file with list of urls 
    """
    if os.path.exists(images_path):
        print(f"Warning: {images_path} exists - renaming it...!")
        os.rename(IMAGES_PATH, IMAGES_PATH + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
        
    subprocess.call(["img2dataset", "--url_list="+url_path, "--output_folder="+images_path, "--thread_count=64", "--image_size=256"])

In [63]:
download_images(URLS_PATH, IMAGES_PATH)

Starting the downloading of this file
Sharding file number 1 of 1 called /Users/yavuz/data/LAION-20000/urls.txt


2it [02:45, 82.59s/it]


worker  - success: 0.707 - failed to download: 0.274 - failed to resize: 0.019 - images per sec: 114 - count: 10000
total   - success: 0.707 - failed to download: 0.274 - failed to resize: 0.019 - images per sec: 114 - count: 10000
worker  - success: 0.700 - failed to download: 0.281 - failed to resize: 0.019 - images per sec: 108 - count: 8162
total   - success: 0.704 - failed to download: 0.277 - failed to resize: 0.019 - images per sec: 111 - count: 18162
File sharded in 2 shards
Downloading starting now, check your bandwidth speed (with bwm-ng)your cpu (with htop), and your disk usage (with iotop)!


In [64]:
def get_valid_file_ids(path:str) -> list[int]:
    """
    Return the ids of all files in IMAGES_PATH (recursively) that end with .jpg
    """
    files = glob.glob(path+"/*/*.jpg")
    files = [file.split('/')[-2:] for file in files]
    print(f"Found {len(files)} files")
    
    ids = [int(file[1].split('.')[0]) for file in files]
    ids.sort()
    return ids

In [65]:
ids = get_valid_file_ids(IMAGES_PATH)
ids

Found 12788 files


[0,
 1,
 2,
 3,
 5,
 6,
 7,
 8,
 9,
 11,
 13,
 14,
 15,
 16,
 17,
 19,
 22,
 24,
 26,
 27,
 28,
 29,
 31,
 33,
 34,
 35,
 38,
 40,
 43,
 44,
 45,
 46,
 47,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 59,
 61,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 73,
 75,
 76,
 78,
 80,
 83,
 85,
 86,
 88,
 89,
 91,
 93,
 94,
 96,
 97,
 99,
 100,
 101,
 102,
 104,
 105,
 106,
 107,
 109,
 111,
 112,
 113,
 114,
 115,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 128,
 131,
 134,
 135,
 136,
 137,
 138,
 139,
 142,
 144,
 147,
 148,
 149,
 150,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 163,
 164,
 166,
 167,
 169,
 170,
 171,
 172,
 174,
 176,
 178,
 179,
 180,
 181,
 182,
 183,
 185,
 186,
 190,
 191,
 192,
 193,
 196,
 197,
 199,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 212,
 213,
 215,
 217,
 218,
 221,
 222,
 223,
 224,
 225,
 229,
 230,
 231,
 233,
 236,
 237,
 238,
 239,
 242,
 243,
 244,
 245,
 247,
 249,
 250,
 252,
 254,
 255,
 256,
 257,
 260,
 268,
 269

In [66]:
data_with_images = data.iloc[ids]
data_with_images

Unnamed: 0,SAMPLE_ID,URL,TEXT,HEIGHT,WIDTH,LICENSE,NSFW,similarity
1,1.060015e+12,https://thumbs.ebaystatic.com/images/g/DYEAAOS...,Silverline Air Framing Nailer 90mm 10 - 12 Gau...,225.0,225.0,?,UNLIKELY,0.312485
2,3.372497e+12,https://farm1.staticflickr.com/784/40182677504...,Anhui Mountains,800.0,514.0,?,UNLIKELY,0.316512
3,3.820200e+11,https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...,Acute pain in a woman knee,257.0,240.0,?,UNLIKELY,0.344278
5,2.179119e+12,https://i.pinimg.com/236x/03/38/05/0338055833e...,Essentials Barnwood 70-inch TV Media Stand,236.0,236.0,?,UNLIKELY,0.332799
7,1.727450e+11,http://cdn.pastemagazine.com/www/articles/2011...,Ben Affleck Could Be Latest Addition To <em>Th...,320.0,320.0,?,UNLIKELY,0.353303
...,...,...,...,...,...,...,...,...
19994,4.063788e+12,https://publicauthordotcom.files.wordpress.com...,Letters over the Wall cover,205.0,300.0,?,UNLIKELY,0.303382
19995,1.530866e+12,https://www.digsdigs.com/photos/sweet-shabby-c...,Http Www Digsdigs Com 33 Sweet Shabby Chic Bed...,480.0,486.0,?,UNLIKELY,0.352294
19997,4.247173e+12,https://goalstudio.com/web/product/medium/2020...,TOTTENHAM 940 BALL CAP - GOLD,1100.0,1460.0,?,UNLIKELY,0.388634
19998,1.505120e+11,https://thumbs.dreamstime.com/m/clown-toy-colo...,Clown toy color vector illustration Royalty Fr...,92.0,130.0,?,UNLIKELY,0.349995


In [67]:
data_with_images = data_with_images.reset_index()
data_with_images

Unnamed: 0,index,SAMPLE_ID,URL,TEXT,HEIGHT,WIDTH,LICENSE,NSFW,similarity
0,1,1.060015e+12,https://thumbs.ebaystatic.com/images/g/DYEAAOS...,Silverline Air Framing Nailer 90mm 10 - 12 Gau...,225.0,225.0,?,UNLIKELY,0.312485
1,2,3.372497e+12,https://farm1.staticflickr.com/784/40182677504...,Anhui Mountains,800.0,514.0,?,UNLIKELY,0.316512
2,3,3.820200e+11,https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...,Acute pain in a woman knee,257.0,240.0,?,UNLIKELY,0.344278
3,5,2.179119e+12,https://i.pinimg.com/236x/03/38/05/0338055833e...,Essentials Barnwood 70-inch TV Media Stand,236.0,236.0,?,UNLIKELY,0.332799
4,7,1.727450e+11,http://cdn.pastemagazine.com/www/articles/2011...,Ben Affleck Could Be Latest Addition To <em>Th...,320.0,320.0,?,UNLIKELY,0.353303
...,...,...,...,...,...,...,...,...,...
12783,19994,4.063788e+12,https://publicauthordotcom.files.wordpress.com...,Letters over the Wall cover,205.0,300.0,?,UNLIKELY,0.303382
12784,19995,1.530866e+12,https://www.digsdigs.com/photos/sweet-shabby-c...,Http Www Digsdigs Com 33 Sweet Shabby Chic Bed...,480.0,486.0,?,UNLIKELY,0.352294
12785,19997,4.247173e+12,https://goalstudio.com/web/product/medium/2020...,TOTTENHAM 940 BALL CAP - GOLD,1100.0,1460.0,?,UNLIKELY,0.388634
12786,19998,1.505120e+11,https://thumbs.dreamstime.com/m/clown-toy-colo...,Clown toy color vector illustration Royalty Fr...,92.0,130.0,?,UNLIKELY,0.349995


In [68]:
write_urls(data_with_images, SUCCEEDED_URLS_PATH)
#download_images(URLS_PATH, IMAGES_PATH)

Finished writing 12788 URLs to /Users/yavuz/data/LAION-20000/succeeded-urls.txt


In [69]:
# save metadata to parquet
data_with_images.to_parquet(DATA_PATH)

In [73]:
def move_files(images_path: str):
    """
    Rename (and move files across shards) so that we have a continuous range of file names from 0 to n
    """
    files = glob.glob(IMAGES_PATH+"/*/*.jpg")
    files.sort()

    for i in range(0, len(files)):
        shard = str(i // 10000).zfill(5)
        index = str(i % 10000).zfill(4)
        
        image_file = files[i]
        json_file = image_file.replace(".jpg", ".json")
        
        shutil.move(image_file, f"{images_path}/{shard}/{shard}{index}.jpg")
        shutil.move(json_file, f"{images_path}/{shard}/{shard}{index}.json")

In [74]:
move_files(IMAGES_PATH)