# Setup

In [1]:
import os
import tempfile

import graze

Here we're just setting up some URLs to play with. We first transform them into tiny URLs so they take up less space!

In [2]:
# Raw GitHub locations of two small text files from the graze repository
_original_url1 = 'https://raw.githubusercontent.com/thorwhalen/graze/refs/heads/master/README.md'
_original_url2 = 'https://raw.githubusercontent.com/thorwhalen/graze/refs/heads/master/LICENSE'

# Shorten both in one pass; the short forms are what the following cells use
url1, url2 = (
    graze.tiny_url(long_url) for long_url in (_original_url1, _original_url2)
)

print(f"{url1=}")
print(f"{url2=}")

url1='https://tinyurl.com/24vnnr29'
url2='https://tinyurl.com/288vf8rm'


In [3]:
# Where graze keeps its cached files.
#   False -> create a fresh, empty temporary directory (what this demo uses)
#   None  -> NOT a temporary directory: the default cache directory on your
#            system (whatever DFLT_GRAZE_DIR is set to)
cache_rootdir = False

if cache_rootdir is False:
    # mkdtemp creates a new directory and returns its path; `tempfile` is
    # imported with the other imports at the top of the notebook
    cache_rootdir = tempfile.mkdtemp(prefix="graze_cache_")

print(f"Using cache rootdir: {cache_rootdir}")

Using cache rootdir: /var/folders/mc/c070wfh51kxd9lft8dl74q1r0000gn/T/graze_cache_25_ausil


# Basics

Note that even just `g = Graze()` will work; it will simply use the default `graze.DFLT_GRAZE_DIR` location. This is practical when you want to "just download and cache" files, and
since the file paths are automatically created from the URLs, you still get **some** automatic separation,
insofar as the URL structure separates things (protocol, domain name, extension, etc.). Here we are specifying our own cache root directory.

In [4]:
# Back this Graze instance with the cache root chosen above rather than the
# default location, then show which directory it reports using.
g = graze.Graze(cache_rootdir)
print(f"Graze cache directory: {g.rootdir}")

Graze cache directory: /var/folders/mc/c070wfh51kxd9lft8dl74q1r0000gn/T/graze_cache_25_ausil


List the items in your graze cache (you should have none if you used the temp folder as your root).

In [5]:
# Listing g shows what is currently cached; nothing has been fetched yet
list(g)

[]

Now let's get the contents for `url1`. 

In [6]:
# First access to url1 through this cache; show the size and type of what comes back
url1_contents = g[url1]
print(f"{len(url1_contents)} {type(url1_contents).__name__} ")

13134 bytes 


And now you should have one item in your cache, so `g` will list it.

In [16]:
# The cache is listed by URL, so url1 should now show up here
list(g)

['https://tinyurl.com/24vnnr29']

Let's get those contents as before, but this time they are taken from the cache.
You won't really notice the difference here, but you **will** notice it if you're offline or dealing with a lot of data.

In [None]:
# Same lookup as before; url1 is already listed in the cache at this point
url1_contents = g[url1]
print(f"{len(url1_contents)} {type(url1_contents).__name__} ")

13134 bytes 


Now let's get the contents of `url2`.

In [9]:
# url2 has not been requested through g before, so this adds a second cache entry
url2_contents = g[url2]
print(f"{len(url2_contents)} {type(url2_contents).__name__} ")

1069 bytes 


Notice `g` lists the contents of your cache as urls.

In [10]:
# Both entries are reported under the URLs they were requested with
list(g)

['https://tinyurl.com/288vf8rm', 'https://tinyurl.com/24vnnr29']

But you can see what the cache file is:

In [11]:
# Ask g which file on disk backs the cached entry for url1
filepath = g.filepath_of(url1)
# url1 was fetched earlier, so the file must already exist
assert os.path.isfile(filepath)
print(filepath)

/var/folders/mc/c070wfh51kxd9lft8dl74q1r0000gn/T/graze_cache_25_ausil/https/tinyurl.com_f/24vnnr29


# Deleting cached data

If you want to refresh data from the source, you can delete the cached copy:

In [13]:
# State of the cache before anything is removed
print(f"Before deletion: {len(g)} items")
print(f"url1 in cache: {url1 in g}")

# Delete url1 from cache
del g[url1]

# The entry count drops and the membership test now fails for url1
print(f"\nAfter deleting url1: {len(g)} items")
print(f"url1 in cache: {url1 in g}")

# Next access will re-download, which also puts url1 back in the cache
print("\nRe-accessing url1 (will download again)...")
url1_contents_again = g[url1]
print(f"url1 in cache: {url1 in g}")

Before deletion: 2 items
url1 in cache: True

After deleting url1: 1 items
url1 in cache: False

Re-accessing url1 (will download again)...
url1 in cache: True


# Dict-like operations

Graze acts like a dictionary, so you can use familiar dict operations:

In [14]:
# Let's delete url2 so it's not in the cache anymore
del g[url2]

In [15]:
# Check if URL is in cache
print(f"Is url1 cached? {url1 in g}")
print(f"Is url2 cached? {url2 in g}")

# Get url2 to cache it
url2_contents = g[url2]
print(f"\nAfter fetching url2:")
print(f"Is url2 cached? {url2 in g}")

Is url1 cached? True
Is url2 cached? False

After fetching url2:
Is url2 cached? True


In [16]:
# Count cached items
print(f"Number of cached URLs: {len(g)}")

# Iterate over cached URLs
print("\nCached URLs:")
for url in g:
    print(f"  - {url}")

Number of cached URLs: 2

Cached URLs:
  - https://tinyurl.com/288vf8rm
  - https://tinyurl.com/24vnnr29


In [17]:
# Get keys, values, items
print("Keys (URLs):")
for key in g.keys():
    print(f"  {key}")

print("\nValues (first 50 bytes of each):")
for value in g.values():
    print(f"  {value[:50]}...")

print("\nItems (first 50 bytes of content):")
for key, value in g.items():
    print(f"  {key}: {value[:50]}...")

Keys (URLs):
  https://tinyurl.com/288vf8rm
  https://tinyurl.com/24vnnr29

Values (first 50 bytes of each):
  b'MIT License\n\nCopyright (c) [year] [fullname]\n\nPerm'...
  b'# graze\n\nCache (a tiny part of) the internet.\n\n(Fo'...

Items (first 50 bytes of content):
  https://tinyurl.com/288vf8rm: b'MIT License\n\nCopyright (c) [year] [fullname]\n\nPerm'...
  https://tinyurl.com/24vnnr29: b'# graze\n\nCache (a tiny part of) the internet.\n\n(Fo'...


# Working with filepaths

Sometimes you need the actual file path instead of the contents:

In [18]:
# Create a Graze instance that returns filepaths instead of contents
g_filepaths = graze.Graze(cache_rootdir, return_filepaths=True)

filepath1 = g_filepaths[url1]
print(f"Type: {type(filepath1).__name__}")
print(f"Filepath: {filepath1}")
print(f"File exists: {os.path.isfile(filepath1)}")

# Read the file yourself
with open(filepath1, 'rb') as f:
    contents = f.read()
print(f"\nFirst 100 bytes: {contents[:100]}")

Type: str
Filepath: /var/folders/mc/c070wfh51kxd9lft8dl74q1r0000gn/T/graze_cache_25_ausil/https/tinyurl.com_f/24vnnr29
File exists: True

First 100 bytes: b'# graze\n\nCache (a tiny part of) the internet.\n\n(For the technically inclined, `graze` is meant to ea'


# Key ingress callback

Get notified when graze is actually downloading from the internet:

In [19]:
# Every URL handed to the callback below is collected in this list
downloads = []

def track_downloads(url):
    """Record ``url`` in ``downloads``, announce it, and hand it back unchanged.

    Intended to be passed to ``graze.Graze`` as its ``key_ingress`` argument.
    """
    downloads.append(url)
    print(f"📥 Downloading: {url}")
    return url

# Create Graze with the callback (same cache root as the earlier instances)
g_notify = graze.Graze(cache_rootdir, key_ingress=track_downloads)

# Clear cache to force re-download
if url1 in g_notify:
    del g_notify[url1]

# The returned contents are discarded (`_`); only the callback's output matters here
print("Accessing url1 (not cached, will download):")
_ = g_notify[url1]

print("\nAccessing url1 again (cached, won't download):")
_ = g_notify[url1]

# `downloads` was filled by track_downloads, once per call it received
print(f"\nTotal downloads tracked: {len(downloads)}")
print(f"Downloaded URLs: {downloads}")

Accessing url1 (not cached, will download):
📥 Downloading: https://tinyurl.com/24vnnr29

Accessing url1 again (cached, won't download):

Total downloads tracked: 1
Downloaded URLs: ['https://tinyurl.com/24vnnr29']


# Using the graze() function

For one-off fetches, you can use the `graze()` function directly:

In [20]:
# Simple usage - uses default cache location
contents = graze.graze(url1)
print(f"Got {len(contents)} bytes")
print(f"First 100 bytes: {contents[:100]}")

# Specify cache location
contents = graze.graze(url2, cache=cache_rootdir)
print(f"\nGot {len(contents)} bytes from url2")

# Get filepath instead of contents
filepath = graze.graze(url1, cache=cache_rootdir, return_key=True)
print(f"\nFilepath: {filepath}")

Got 13134 bytes
First 100 bytes: b'# graze\n\nCache (a tiny part of) the internet.\n\n(For the technically inclined, `graze` is meant to ea'

Got 1069 bytes from url2

Filepath: /var/folders/mc/c070wfh51kxd9lft8dl74q1r0000gn/T/graze_cache_25_ausil/https/tinyurl.com_f/24vnnr29


# Custom cache backends

Graze can use any MutableMapping as a cache backend, including a simple dict for in-memory caching:

In [21]:
# Use a dict as cache (in-memory, no persistence)
my_cache = {}
g_dict = graze.Graze(my_cache)

# Access some URLs
_ = g_dict[url1]
_ = g_dict[url2]

print(f"Cache is a {type(my_cache).__name__}")
print(f"Number of items in dict cache: {len(my_cache)}")
print(f"Keys in cache: {list(my_cache.keys())}")

# You can also inspect the dict directly
for key in my_cache:
    print(f"  {key}: {len(my_cache[key])} bytes")

Cache is a dict
Number of items in dict cache: 2
Keys in cache: ['https/tinyurl.com_f/24vnnr29', 'https/tinyurl.com_f/288vf8rm']
  https/tinyurl.com_f/24vnnr29: 13134 bytes
  https/tinyurl.com_f/288vf8rm: 1069 bytes


# Some other useful graze utils

## tiny_url

In [3]:
from graze import tiny_url

# Calling tiny_url on a long URL gives back its short form
url = 'https://raw.githubusercontent.com/thorwhalen/graze/refs/heads/master/README.md'
little_url = tiny_url(url)
print(little_url)

https://tinyurl.com/24vnnr29


In the language of codecs, one could say you are "encoding" the url. 
We made `tiny_url` a codec with `encode` and `decode` attributes you can use. 
See that `tiny_url.encode(url)` is equivalent to `tiny_url(url)`.

In [None]:
# encode() is the explicit spelling of calling tiny_url directly;
# the assert checks that both give the same short URL
encoded_url = tiny_url.encode(url)
assert encoded_url == little_url
print(encoded_url)

https://tinyurl.com/24vnnr29


And `tiny_url.decode(encoded_url)` will tell you what a previously encoded `url` was.

In [5]:
# decode() maps the short URL back; the assert checks the round trip
# recovers the original long URL
decoded_url = tiny_url.decode(encoded_url)
assert decoded_url == url
print(decoded_url)

https://raw.githubusercontent.com/thorwhalen/graze/refs/heads/master/README.md
