## Scan the data folder
We downloaded a bunch of assets to the data folder. Let's see what we have and compile them into a single `.json` dataset file. This way, we can retrieve the data from a single file. For simplicity, we will gather information only about `images`, because they are easier to show on a website.

In [None]:
import json
import os

# Load the URL records; later cells read the "id" and "url" keys of each entry.
# The context manager closes the file handle as soon as parsing is done, and
# the explicit encoding avoids depending on the platform's locale default.
with open("urls_with_snapshots.json", encoding="utf-8") as urls_file:
    urls_with_snapshots = json.load(urls_file)
print("Loaded", len(urls_with_snapshots), "urls with snapshots")
# Only preview an entry when there is one; indexing an empty list would raise.
if urls_with_snapshots:
    print("Example:", urls_with_snapshots[0])

### Find assets from the `data` folder

In [None]:
DATA_DIR = "example-data"
ASSETS_FOLDER_NAMES = ["images"]
assets = []

# Traverse the whole data tree; a directory contributes its files only when
# its own name is one of the asset folder names.
for current_dir, _subdirs, filenames in os.walk(DATA_DIR):
    if os.path.basename(current_dir) not in ASSETS_FOLDER_NAMES:
        continue
    assets.extend(os.path.join(current_dir, filename) for filename in filenames)

print("Found", len(assets), "assets in", ASSETS_FOLDER_NAMES)

In [None]:
import hashlib


def md5_from_file(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte reads, so
    its contents are never held in memory all at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()


assets_metadata_list = []

# Index the URL records by id once, so each asset needs a single dict lookup
# instead of a scan over the whole list. setdefault keeps the first record for
# a repeated id, which matches a first-match linear search.
urls_by_id = {}
for url_w_s in urls_with_snapshots:
    urls_by_id.setdefault(url_w_s["id"], url_w_s)

for asset in assets:
    # The path components are read from the end:
    #   <url id>/<timestamp>/<asset type>/<file>
    print("asset:", asset)

    # enclosing folder is the asset type
    asset_type_dir = os.path.dirname(asset)
    asset_type = os.path.basename(asset_type_dir)
    print("asset_type:", asset_type)

    # the folder enclosing that one is the timestamp
    # (only printed; it is not stored in the metadata below)
    timestamp_dir = os.path.dirname(asset_type_dir)
    timestamp = os.path.basename(timestamp_dir)
    print("timestamp:", timestamp)

    # one level further up is the url id
    url_id = os.path.basename(os.path.dirname(timestamp_dir))
    print("url_id:", url_id)

    # look up the url for this url id; it stays None when no record matches
    url_record = urls_by_id.get(url_id)
    url = url_record["url"] if url_record is not None else None
    print("url:", url)

    file_md5 = md5_from_file(asset)
    print("file_md5:", file_md5)

    asset_metadata = {
        "url": url,
        "url_id": url_id,
        "md5": file_md5,
        "type": asset_type,
        "path": asset,
    }

    assets_metadata_list.append(asset_metadata)

# Only preview an entry when at least one asset was found; indexing an empty
# list would raise IndexError before the dataset file is written.
if assets_metadata_list:
    print("Example:", assets_metadata_list[0])

# save the list of assets to a file; the context manager makes sure the
# output is flushed and the handle is closed
with open("assets.json", "w", encoding="utf-8") as assets_file:
    json.dump(assets_metadata_list, assets_file, indent=2)