In [None]:
# Load the IPython extension named lab_black for this kernel session.
# NOTE(review): presumably the nb_black cell auto-formatter; confirm it is installed here.
%load_ext lab_black

In [None]:
from time import time
from pathlib import Path
from pickle import dumps, loads
from sqlitedict import SqliteDict as sqldict

In [None]:
# Working folder for the files the cells below write.
data = "pickey"
# Build the path from `data` so the folder name is spelled in exactly one place.
Path(data).mkdir(exist_ok=True)

In [None]:
# Count from 0 through 9 on a single line.
# Unpacking the range hands every number to one print call; the default
# separator and end=" " reproduce the space after each value.
print(*range(10), end=" ")

In [None]:
# Count from 1 through 10: start the range at 1 and stop just before 11.
print(*range(1, 11), end=" ")

In [None]:
# Same 1-to-10 count, but keep the plain range(10) and add one to each value.
print(*(n + 1 for n in range(10)), end=" ")

In [None]:
# Print every multiple of 1,000 below 100,000.
# Stepping the range by 1,000 visits exactly the values a modulo test would keep.
for n in range(0, 100000, 1000):
    print(n, end=" ")

In [None]:
# Same multiples of 1,000, now rendered with thousands separators (the "," format spec).
for i in range(0, 100000, 1000):
    print(f"{i:,}", end=" ")

In [None]:
# Try something bigger: one million numbered lines in a plain text file.
filename = f"{data}/text.txt"
with open(filename, "wt") as fh:
    # writelines pulls from the generator one "<number>\n" line at a time.
    fh.writelines(f"{i}\n" for i in range(1000000))
print("Done")

In [None]:
# That was fast. How big is that file?
bytesize = Path(filename).stat().st_size
kilo = 1000  # decimal (SI) kilo, not 1024
print(f"The file {filename} is {bytesize:,} Bytes")
# The :,.0f spec means thousands separators and zero decimal places.
print(f"Abbreviated to {bytesize / kilo:,.0f} Kilobytes")
# Dividing by kilo twice yields megabytes, so the suffix is M (it was mislabelled K).
print(f"Or just {bytesize / kilo / kilo:.0f}M")

In [None]:
# What's it like to write 100,000 keys into a database?
filename = f"{data}/database.db"
now = time()
with sqldict(filename) as db:
    for i in range(100000):
        db[i] = None
        # Commit in batches of 10,000 and use each commit as a progress tick.
        if not i % 10000:
            db.commit()
            print(f"{i:,}", end=" ")
    # The last in-loop commit happens at i == 90,000. sqlitedict leaves
    # autocommit off by default and does not commit on close, so flush the
    # final batch explicitly or keys 90,001-99,999 never reach the file.
    db.commit()
seconds = int(time() - now)
print(f"\nDone ({seconds} seconds)")

In [None]:
# That was fast. How big is that file?
bytesize = Path(filename).stat().st_size
kilo = 1000  # decimal (SI) kilo, not 1024
# A notebook cell only echoes its final expression, so a bare f-string on an
# earlier line is silently discarded. Print both lines explicitly.
print(f"{bytesize:,} Bytes")
print(f"The file {filename} is {bytesize / kilo:,.0f} Kilobytes")

In [None]:
# How many zeros in a billion? Show 10**9 with thousands separators and count them.
format(10**9, ",")

In [None]:
# How many zeros in a hundred-million? Show 10**8 with thousands separators and count them.
format(10**8, ",")

In [None]:
# Count up toward a billion, reporting at every hundred-million.
# The loop really does visit every integer below a billion, so expect a long wait.
hundredmillion = 10**8
billion = 10**9
now = time()
for i in range(billion):
    if i % hundredmillion:
        continue  # not on a hundred-million boundary
    print(f"{i:,}", end="  ")
seconds = int(time() - now)
print(f"\nDone ({seconds} seconds)")  # Computers are fast but not that fast

In [None]:
# Make an in-memory set holding a million integers.
million = 10**6
seen = set(range(million))
print(f"Made {len(seen):,} keys.")

# Dump the pickled set to a file.
filename = f"{data}/dumps.pkl"
kilo = 1000  # decimal (SI) kilo, not 1024
Path(filename).write_bytes(dumps(seen))
print(f"Saved {filename} to drive.")

# Report size of file.
bytesize = Path(filename).stat().st_size
print(f"{filename} is {bytesize / kilo:,.0f} Kilobytes")

# Load the pickled set back out of the file.
# NOTE: unpickling can execute arbitrary code. This is acceptable here only
# because the file was written by this very cell a moment ago.
seen = loads(Path(filename).read_bytes())
print(f"Read the {type(seen)} back off of disk.")

print("Done")

In [None]:
from json import loads as js
from PIL import Image
from httpx import get
from io import BytesIO
from time import sleep
from pathlib import Path
from pickle import loads, dumps
from imagehash import phash, whash
from IPython.display import display
from PIL.PngImagePlugin import PngInfo

# Folder layout for the cat-image cells.
data = "pickey"
save_to = f"{data}/cats"
thumbs = f"{data}/thumbs"

# parents=True also creates the `data` folder itself, so this cell works even
# when it is the first one run on a fresh kernel or in a fresh checkout.
Path(save_to).mkdir(parents=True, exist_ok=True)
Path(thumbs).mkdir(parents=True, exist_ok=True)

In [None]:
# Download 30 cats that don't exist
url = "https://thiscatdoesnotexist.com/"
cats = 30
for i in range(cats):
    filename = f"{save_to}/cat-{i:03}.jpg"
    # Files already on disk are skipped, so re-running only fetches what is missing.
    if not Path(filename).exists():
        print(f"{cats - i} Downloading: {filename}")
        response = get(url)
        # Fail with a clear HTTP error instead of handing an error page to Pillow.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img.save(filename)
        sleep(1)  # pause between requests
print("Done")

In [None]:
size = 64  # thumbnails are shrunk to fit inside a size x size box

# Load the set of thumbnail names recorded by earlier runs, if there is one.
# NOTE: unpickling can execute arbitrary code; only load a file this notebook wrote.
pickled_cats = f"{data}/seencats.pkl"
if Path(pickled_cats).exists():
    with open(pickled_cats, "rb") as fh:
        seen = loads(fh.read())
else:
    seen = set()

# Make thumbnails of cat pics.
# sorted() pins the processing order, which glob() alone does not guarantee.
for cat in sorted(Path(save_to).glob("*.jpg")):
    # The context manager releases the source file handle after each cat.
    with Image.open(cat) as img:
        # Make filename using perceptual image hash.
        awhash = whash(img, hash_size=8)
        width, height = img.width, img.height
        filename = f"{width}x{height}_{awhash}_.png"
        if filename in seen:
            continue  # already recorded: skip the metadata and thumbnail work

        thumb = img.copy()
        thumb.thumbnail((size, size))

        # Describe the source image; every value is stored as text in the PNG.
        meta_data = {
            "filename": cat.name,
            "width": width,
            "height": height,
            "format": img.format,
            "format_description": img.format_description,
            "bands": img.getbands(),
            "extremes": img.getextrema(),
            "xmp": img.getxmp(),
        }
        pi = PngInfo()
        for key, value in meta_data.items():
            pi.add_text(key, f"{value}")

        print(cat)
        display(thumb)
        seen.add(filename)
        print(filename)
        thumb.save(
            f"{thumbs}/{filename}",
            "PNG",
            pnginfo=pi,
            save_all=True,
        )
        print()

# Persist the updated set so the next run can skip these thumbnails.
with open(pickled_cats, "wb") as fh:
    fh.write(dumps(seen))

# Report size of file
bytesize = Path(pickled_cats).stat().st_size
print(f"{pickled_cats} is {bytesize:,} Bytes")

print("Done")

In [None]:
# Notice how some cats are more hexed than others. 
for cat in Path(thumbs).glob("*.png"):
    name = cat.name
    parts = name.split("_")
    whash = parts[3]
    print(whash, hex(int(whash, 16)))

In [None]:
# The meta data is still in the PNG thumbnails.
for cat in Path(thumbs).glob("*.png"):
    print(cat)
    # Read the text entries while the file is open; the context manager then
    # releases the handle instead of leaving one open per thumbnail.
    with Image.open(cat) as img:
        meta = img.text
    for key, value in meta.items():
        print(f"{key}: {value}")
    print()