In [119]:
"""
Check that the prepared dataset (the metadata parquet file and the downloaded images) is consistent.
"""

In [None]:
import pandas as pd
import os
import json

In [120]:
# Size tag of the prepared dataset; it selects the LAION-<count> directory name.
# NOTE(review): the metadata table may hold fewer rows than this number.
DATASET_ENTITY_COUNT = 20000

# Root directory that holds the prepared LAION-<count> folders. Set the
# LAION_DATA_ROOT environment variable to run the notebook on another machine;
# when it is unset the original location is used, so the paths are unchanged.
_DATA_ROOT = os.environ.get("LAION_DATA_ROOT", "/Users/yavuz/data").rstrip("/")

# The trailing slash is kept so the plain string concatenations below stay valid.
PREP_DATASET_PATH = f"{_DATA_ROOT}/LAION-{DATASET_ENTITY_COUNT}/"
METADATA_PATH = PREP_DATASET_PATH + "metadata.parquet"
IMAGES_PATH = PREP_DATASET_PATH + "images"

In [121]:
def read_saved_data(path: str) -> pd.DataFrame:
    """
    Load a previously saved parquet dataset and report how many entries it has.

    Args:
        path: Location of the parquet file to read.

    Returns:
        The data frame read from ``path``.
    """
    loaded = pd.read_parquet(path)
    entry_count = len(loaded)
    # Echo the size so the notebook output records what was actually read.
    print(f"Read {entry_count} entries from {path}.")
    return loaded

In [122]:
# Load the metadata table written during dataset preparation.
data = read_saved_data(METADATA_PATH)
# Bare expression on the last line: the notebook renders the frame as this cell's output.
data

Read 12788 entries from /Users/yavuz/data/LAION-20000/metadata.parquet.


Unnamed: 0,index,SAMPLE_ID,URL,TEXT,HEIGHT,WIDTH,LICENSE,NSFW,similarity
0,1,1.060015e+12,https://thumbs.ebaystatic.com/images/g/DYEAAOS...,Silverline Air Framing Nailer 90mm 10 - 12 Gau...,225.0,225.0,?,UNLIKELY,0.312485
1,2,3.372497e+12,https://farm1.staticflickr.com/784/40182677504...,Anhui Mountains,800.0,514.0,?,UNLIKELY,0.316512
2,3,3.820200e+11,https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...,Acute pain in a woman knee,257.0,240.0,?,UNLIKELY,0.344278
3,5,2.179119e+12,https://i.pinimg.com/236x/03/38/05/0338055833e...,Essentials Barnwood 70-inch TV Media Stand,236.0,236.0,?,UNLIKELY,0.332799
4,7,1.727450e+11,http://cdn.pastemagazine.com/www/articles/2011...,Ben Affleck Could Be Latest Addition To <em>Th...,320.0,320.0,?,UNLIKELY,0.353303
...,...,...,...,...,...,...,...,...,...
12783,19994,4.063788e+12,https://publicauthordotcom.files.wordpress.com...,Letters over the Wall cover,205.0,300.0,?,UNLIKELY,0.303382
12784,19995,1.530866e+12,https://www.digsdigs.com/photos/sweet-shabby-c...,Http Www Digsdigs Com 33 Sweet Shabby Chic Bed...,480.0,486.0,?,UNLIKELY,0.352294
12785,19997,4.247173e+12,https://goalstudio.com/web/product/medium/2020...,TOTTENHAM 940 BALL CAP - GOLD,1100.0,1460.0,?,UNLIKELY,0.388634
12786,19998,1.505120e+11,https://thumbs.dreamstime.com/m/clown-toy-colo...,Clown toy color vector illustration Royalty Fr...,92.0,130.0,?,UNLIKELY,0.349995


In [123]:
def verify_images(image_path: str, data: pd.DataFrame) -> bool:
    """
    Verify that the files under image_path are consistent with the data frame.

    For every row, the index label ``i`` is mapped to a shard directory
    (``i // 10000``, zero-padded to 5 digits) and a position inside that shard
    (``i % 10000``, zero-padded to 4 digits). A row passes when both
    ``<image_path>/<shard>/<shard><position>.jpg`` and the matching ``.json``
    exist and the ``"url"`` entry of the json equals the row's ``"URL"`` value.

    Args:
        image_path: Directory that contains the shard sub-directories.
        data: Metadata frame with a "URL" column. Its index labels must be
            integers, because the file names are derived from them.

    Returns:
        True when every row passes. False at the first failing row, after
        printing a description of the problem.

    Raises:
        KeyError: If a json file has no "url" entry or the frame has no
            "URL" column.
    """
    for i, row in data.iterrows():
        shard = str(i // 10000).zfill(5)
        index = str(i % 10000).zfill(4)

        # Image and json side-car share the same stem inside the shard folder.
        stem = f"{image_path}/{shard}/{shard}{index}"
        image_file = f"{stem}.jpg"
        json_file = f"{stem}.json"

        # Check that the image exists.
        if not os.path.exists(image_file):
            print(f"Image {image_file} does not exist for this row: {index, row}")
            return False

        # Check that the json exists.
        if not os.path.exists(json_file):
            print(f"Json {json_file} does not exist for this row: {index, row}")
            return False

        # Read the json through a context manager so the handle is closed
        # right away instead of leaking one open file per row.
        with open(json_file, encoding="utf-8") as json_handle:
            json_url = json.load(json_handle)["url"]

        # Compare the data URL with the json URL.
        if row["URL"] != json_url:
            print(f"Error: URL does not match for this index, row: {index, row}")
            print("Image file: ", image_file)
            # Report the json path here (the image path used to be printed twice).
            print("Json file: ", json_file)
            return False

    print("All images and json files are verified.")
    return True

In [124]:
verify_images(IMAGES_PATH, data)

All images and json files are verified.


True