In [1]:
%reload_ext autoreload
%autoreload 2

We'll add the project to PYTHON_PATH so we can import the modules from the project.

In [2]:
import sys, os
proj = os.path.dirname(os.getcwd())
if proj not in sys.path:
    sys.path.insert(0, proj)
sys.path

['/Volumes/Projects/Evidently/record-thing',
 '/opt/homebrew/Caskroom/miniconda/base/lib/python311.zip',
 '/opt/homebrew/Caskroom/miniconda/base/lib/python3.11',
 '/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/lib-dynload',
 '',
 '/opt/homebrew/Caskroom/miniconda/base/lib/python3.11/site-packages']

In case we use PyTorch, we'll need to have a `device` variable to handle the device where the model will be trained.
MPS support is commented out as it isn't fully supported in the DINO model that we use.

In [3]:
# Device independent code
import torch

device = "cpu" 
if torch.cuda.is_available():
    device = "cuda"
# if torch.backends.mps.is_available:
#     device = torch.device("mps")
torch.device(device)
torch.set_default_device(device=device)

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

### RQL Server

The RQL server is started in the background within a Docker container.

In [4]:
!docker compose up -d

/Users/henrikvendelbo/.zshenv:.:1: no such file or directory: /Users/henrikvendelbo/.cargo/env
[1A[1B[0G[?25l[+] Running 1/0
 [32m✔[0m Container rqlite_server  [32mRunning[0m                                        [34m0.0s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container rqlite_server  [32mRunning[0m                                        [34m0.0s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container rqlite_server  [32mRunning[0m                                        [34m0.0s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container rqlite_server  [32mRunning[0m                                        [34m0.0s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container rqlite_server  [32mRunning[0m                                        [34m0.0s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container rqlite_server  [32mRunning[0m                                 

In [5]:
!curl 127.0.0.1:4001/status

/Users/henrikvendelbo/.zshenv:.:1: no such file or directory: /Users/henrikvendelbo/.cargo/env
{"build":{"branch":"master","build_time":"","commit":"8cea072605b6accbfc1607553011d33ef7bb6f87","compiler_command":"musl-gcc","compiler_toolchain":"gc","version":"v8.34.1"},"cluster":{"addr":"bfad3c2a7d3d:4002","api_addr":"bfad3c2a7d3d:4001","https":"false"},"extensions":{"dir":"/rqlite/file/data/extensions","names":["icu.so","sqlean.so","vec0.so"]},"http":{"auth":"enabled","bind_addr":"[::]:4001","cluster":{"local_node_addr":"bfad3c2a7d3d:4002","timeout":"30s"},"queue":{"_default":{"batch_size":128,"max_size":1024,"sequence_number":0,"timeout":"50ms"}},"tls":{"enabled":"false"}},"mux":{"addr":"bfad3c2a7d3d:4002","handlers":"\u0001\u0002","timeout":"30s","tls":"disabled"},"network":{"interfaces":{"eth0":{"flags":"up|broadcast|multicast|running","hardware_address":"02:42:ac:12:00:02","addresses":[{"address":"172.18.0.2/16"}]},"ip6tnl0":{"flags":"0","hardware_address":"","addresses":null},"lo":

In [None]:
# from dataset.models import ClipAsset, Account
from dataset.db import init_db_rqlite

connection = init_db_rqlite(disconnect=False)

Inspecting data

### Account

In [None]:
from dataset.db.rql import connection
from dataset.commons import commons
import pandas as pd

with connection.cursor() as cursor:
    count = cursor.execute(
    """
      SELECT COUNT(*) FROM accounts
    """
            ).fetchone()

    display(f"account count: {count[0]}")

    rows = cursor.execute(
    """
      SELECT id, name, email, sms, region FROM accounts
      LIMIT 30
    """
            ).fetchall()
    
    display("commons.account_id:", commons['account_id'])
    display(pd.DataFrame(rows, columns=["id", "name", "email", "sms", "region"]))

'account count: 1'

'commons.account_id:'

'2pg7CwH6RrdHuvtY1Z3lsDuBVSV'

Unnamed: 0,id,name,email,sms,region
0,2pg7CwH6RrdHuvtY1Z3lsDuBVSV,Joe Schmoe,,,


### Clip Assets

In [None]:
from dataset.db.rql import connection
import pandas as pd

with connection.cursor() as cursor:
    count = cursor.execute(
    """
      SELECT COUNT(*) FROM clip_assets
    """
            ).fetchone()

    display(f"clip_assets count: {count[0]}")

    rows = cursor.execute(
    """
      SELECT clip_assets.id, clip_assets.name FROM clip_assets
      INNER JOIN dino_embedding ON dino_embedding.asset_id = clip_assets.id
      LIMIT 30
    """
            ).fetchall()


    display(pd.DataFrame(rows, columns=["embedding", "distance", "name"]))

'clip_assets count: 0'

Unnamed: 0,embedding,distance,name


### Products

In [None]:
from dataset.db.rql import connection
import pandas as pd

with connection.cursor() as cursor:
    count = cursor.execute(
    """
      SELECT COUNT(*) FROM products
    """
            ).fetchone()

    display(f"clip_assets count: {count[0]}")

    rows = cursor.execute(
    """
      SELECT id, upc, asin, elid, brand, model, color, tags, category, title, description, name FROM product
      LIMIT 30
    """
            ).fetchall()


    display(pd.DataFrame(rows, columns=["id", "upc","asin","elid","brand","model","color","tags","category","title","description", "name"]))


'clip_assets count: 0'

Unnamed: 0,id,upc,asin,elid,brand,model,color,tags,category,title,description,name
